postp_combined-m3xdjffh5v / trainer_state.json
LegrandFrederic's picture
Upload trainer_state.json with huggingface_hub
75ffc0d verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 9940,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005030181086519115,
"grad_norm": 7.163974285125732,
"learning_rate": 4.024144869215292e-06,
"loss": 0.8528,
"step": 10
},
{
"epoch": 0.01006036217303823,
"grad_norm": 5.244224548339844,
"learning_rate": 8.048289738430584e-06,
"loss": 0.7448,
"step": 20
},
{
"epoch": 0.015090543259557344,
"grad_norm": 2.7584030628204346,
"learning_rate": 1.2072434607645876e-05,
"loss": 0.5215,
"step": 30
},
{
"epoch": 0.02012072434607646,
"grad_norm": 2.765652656555176,
"learning_rate": 1.609657947686117e-05,
"loss": 0.3683,
"step": 40
},
{
"epoch": 0.025150905432595575,
"grad_norm": 2.0063560009002686,
"learning_rate": 2.012072434607646e-05,
"loss": 0.2833,
"step": 50
},
{
"epoch": 0.030181086519114688,
"grad_norm": 3.22554087638855,
"learning_rate": 2.4144869215291752e-05,
"loss": 0.2435,
"step": 60
},
{
"epoch": 0.035211267605633804,
"grad_norm": 1.5418182611465454,
"learning_rate": 2.8169014084507046e-05,
"loss": 0.188,
"step": 70
},
{
"epoch": 0.04024144869215292,
"grad_norm": 7.7757697105407715,
"learning_rate": 3.219315895372234e-05,
"loss": 0.1598,
"step": 80
},
{
"epoch": 0.04527162977867203,
"grad_norm": 2.0921378135681152,
"learning_rate": 3.621730382293763e-05,
"loss": 0.1485,
"step": 90
},
{
"epoch": 0.05030181086519115,
"grad_norm": 1.6707618236541748,
"learning_rate": 4.024144869215292e-05,
"loss": 0.1427,
"step": 100
},
{
"epoch": 0.05533199195171026,
"grad_norm": 2.270603895187378,
"learning_rate": 4.426559356136821e-05,
"loss": 0.1314,
"step": 110
},
{
"epoch": 0.060362173038229376,
"grad_norm": 1.4634528160095215,
"learning_rate": 4.8289738430583503e-05,
"loss": 0.1273,
"step": 120
},
{
"epoch": 0.06539235412474849,
"grad_norm": 1.9672114849090576,
"learning_rate": 5.2313883299798795e-05,
"loss": 0.0889,
"step": 130
},
{
"epoch": 0.07042253521126761,
"grad_norm": 1.466834545135498,
"learning_rate": 5.633802816901409e-05,
"loss": 0.1037,
"step": 140
},
{
"epoch": 0.07545271629778671,
"grad_norm": 1.9027611017227173,
"learning_rate": 6.036217303822938e-05,
"loss": 0.1027,
"step": 150
},
{
"epoch": 0.08048289738430583,
"grad_norm": 0.8662084937095642,
"learning_rate": 6.438631790744468e-05,
"loss": 0.0917,
"step": 160
},
{
"epoch": 0.08551307847082495,
"grad_norm": 2.0497639179229736,
"learning_rate": 6.841046277665996e-05,
"loss": 0.0964,
"step": 170
},
{
"epoch": 0.09054325955734406,
"grad_norm": 1.6987979412078857,
"learning_rate": 7.243460764587526e-05,
"loss": 0.0868,
"step": 180
},
{
"epoch": 0.09557344064386318,
"grad_norm": 0.845194935798645,
"learning_rate": 7.645875251509054e-05,
"loss": 0.1019,
"step": 190
},
{
"epoch": 0.1006036217303823,
"grad_norm": 1.4700064659118652,
"learning_rate": 8.048289738430584e-05,
"loss": 0.1007,
"step": 200
},
{
"epoch": 0.1056338028169014,
"grad_norm": 1.8693934679031372,
"learning_rate": 8.450704225352113e-05,
"loss": 0.0897,
"step": 210
},
{
"epoch": 0.11066398390342053,
"grad_norm": 1.1521185636520386,
"learning_rate": 8.853118712273642e-05,
"loss": 0.1064,
"step": 220
},
{
"epoch": 0.11569416498993963,
"grad_norm": 1.094040036201477,
"learning_rate": 9.255533199195171e-05,
"loss": 0.0706,
"step": 230
},
{
"epoch": 0.12072434607645875,
"grad_norm": 1.2210173606872559,
"learning_rate": 9.657947686116701e-05,
"loss": 0.078,
"step": 240
},
{
"epoch": 0.12575452716297786,
"grad_norm": 0.688697874546051,
"learning_rate": 0.00010060362173038229,
"loss": 0.0685,
"step": 250
},
{
"epoch": 0.13078470824949698,
"grad_norm": 0.9590277075767517,
"learning_rate": 0.00010462776659959759,
"loss": 0.0799,
"step": 260
},
{
"epoch": 0.1358148893360161,
"grad_norm": 0.8572778105735779,
"learning_rate": 0.00010865191146881289,
"loss": 0.0705,
"step": 270
},
{
"epoch": 0.14084507042253522,
"grad_norm": 0.8274833559989929,
"learning_rate": 0.00011267605633802819,
"loss": 0.0887,
"step": 280
},
{
"epoch": 0.14587525150905434,
"grad_norm": 0.9543363451957703,
"learning_rate": 0.00011670020120724347,
"loss": 0.0618,
"step": 290
},
{
"epoch": 0.15090543259557343,
"grad_norm": 0.7259362936019897,
"learning_rate": 0.00012072434607645876,
"loss": 0.075,
"step": 300
},
{
"epoch": 0.15593561368209255,
"grad_norm": 0.8904047012329102,
"learning_rate": 0.00012474849094567405,
"loss": 0.07,
"step": 310
},
{
"epoch": 0.16096579476861167,
"grad_norm": 0.7555816173553467,
"learning_rate": 0.00012877263581488935,
"loss": 0.0677,
"step": 320
},
{
"epoch": 0.1659959758551308,
"grad_norm": 0.7622585892677307,
"learning_rate": 0.00013279678068410465,
"loss": 0.0762,
"step": 330
},
{
"epoch": 0.1710261569416499,
"grad_norm": 1.1940929889678955,
"learning_rate": 0.00013682092555331992,
"loss": 0.059,
"step": 340
},
{
"epoch": 0.176056338028169,
"grad_norm": 0.582521915435791,
"learning_rate": 0.00014084507042253522,
"loss": 0.0732,
"step": 350
},
{
"epoch": 0.18108651911468812,
"grad_norm": 0.858220636844635,
"learning_rate": 0.00014486921529175052,
"loss": 0.06,
"step": 360
},
{
"epoch": 0.18611670020120724,
"grad_norm": 0.5738099217414856,
"learning_rate": 0.0001488933601609658,
"loss": 0.0568,
"step": 370
},
{
"epoch": 0.19114688128772636,
"grad_norm": 0.581499457359314,
"learning_rate": 0.00015291750503018109,
"loss": 0.062,
"step": 380
},
{
"epoch": 0.19617706237424548,
"grad_norm": 0.5559177994728088,
"learning_rate": 0.00015694164989939638,
"loss": 0.0774,
"step": 390
},
{
"epoch": 0.2012072434607646,
"grad_norm": 0.6640293598175049,
"learning_rate": 0.00016096579476861168,
"loss": 0.058,
"step": 400
},
{
"epoch": 0.2062374245472837,
"grad_norm": 0.5883966088294983,
"learning_rate": 0.00016498993963782695,
"loss": 0.0625,
"step": 410
},
{
"epoch": 0.2112676056338028,
"grad_norm": 0.5281286835670471,
"learning_rate": 0.00016901408450704225,
"loss": 0.0733,
"step": 420
},
{
"epoch": 0.21629778672032193,
"grad_norm": 0.46224042773246765,
"learning_rate": 0.00017303822937625755,
"loss": 0.0644,
"step": 430
},
{
"epoch": 0.22132796780684105,
"grad_norm": 0.4865374267101288,
"learning_rate": 0.00017706237424547285,
"loss": 0.0677,
"step": 440
},
{
"epoch": 0.22635814889336017,
"grad_norm": 1.1236791610717773,
"learning_rate": 0.00018108651911468815,
"loss": 0.0719,
"step": 450
},
{
"epoch": 0.23138832997987926,
"grad_norm": 1.1186903715133667,
"learning_rate": 0.00018511066398390342,
"loss": 0.064,
"step": 460
},
{
"epoch": 0.23641851106639838,
"grad_norm": 0.6385508179664612,
"learning_rate": 0.00018913480885311872,
"loss": 0.074,
"step": 470
},
{
"epoch": 0.2414486921529175,
"grad_norm": 1.3683863878250122,
"learning_rate": 0.00019315895372233401,
"loss": 0.0626,
"step": 480
},
{
"epoch": 0.24647887323943662,
"grad_norm": 0.5213209390640259,
"learning_rate": 0.0001971830985915493,
"loss": 0.0585,
"step": 490
},
{
"epoch": 0.2515090543259557,
"grad_norm": 0.988146960735321,
"learning_rate": 0.00019999995019278672,
"loss": 0.0479,
"step": 500
},
{
"epoch": 0.25653923541247486,
"grad_norm": 0.707419216632843,
"learning_rate": 0.0001999990647325972,
"loss": 0.0558,
"step": 510
},
{
"epoch": 0.26156941649899396,
"grad_norm": 0.5720793604850769,
"learning_rate": 0.0001999970724567263,
"loss": 0.0383,
"step": 520
},
{
"epoch": 0.2665995975855131,
"grad_norm": 0.53882896900177,
"learning_rate": 0.00019999397338722502,
"loss": 0.0565,
"step": 530
},
{
"epoch": 0.2716297786720322,
"grad_norm": 0.8112373352050781,
"learning_rate": 0.00019998976755839472,
"loss": 0.0647,
"step": 540
},
{
"epoch": 0.2766599597585513,
"grad_norm": 0.7053709626197815,
"learning_rate": 0.00019998445501678657,
"loss": 0.0533,
"step": 550
},
{
"epoch": 0.28169014084507044,
"grad_norm": 0.40134745836257935,
"learning_rate": 0.0001999780358212011,
"loss": 0.0621,
"step": 560
},
{
"epoch": 0.28672032193158953,
"grad_norm": 0.9213513135910034,
"learning_rate": 0.00019997051004268777,
"loss": 0.0729,
"step": 570
},
{
"epoch": 0.2917505030181087,
"grad_norm": 0.6868863105773926,
"learning_rate": 0.00019996187776454374,
"loss": 0.0636,
"step": 580
},
{
"epoch": 0.29678068410462777,
"grad_norm": 1.0402840375900269,
"learning_rate": 0.0001999521390823134,
"loss": 0.0561,
"step": 590
},
{
"epoch": 0.30181086519114686,
"grad_norm": 0.6680485010147095,
"learning_rate": 0.00019994129410378695,
"loss": 0.0551,
"step": 600
},
{
"epoch": 0.306841046277666,
"grad_norm": 0.6051433682441711,
"learning_rate": 0.00019992934294899944,
"loss": 0.0596,
"step": 610
},
{
"epoch": 0.3118712273641851,
"grad_norm": 0.5517066121101379,
"learning_rate": 0.00019991628575022946,
"loss": 0.057,
"step": 620
},
{
"epoch": 0.31690140845070425,
"grad_norm": 0.4259048104286194,
"learning_rate": 0.00019990212265199738,
"loss": 0.0451,
"step": 630
},
{
"epoch": 0.32193158953722334,
"grad_norm": 0.4159329831600189,
"learning_rate": 0.0001998868538110641,
"loss": 0.0493,
"step": 640
},
{
"epoch": 0.32696177062374243,
"grad_norm": 0.5596720576286316,
"learning_rate": 0.0001998704793964291,
"loss": 0.0556,
"step": 650
},
{
"epoch": 0.3319919517102616,
"grad_norm": 0.821570098400116,
"learning_rate": 0.00019985299958932866,
"loss": 0.0517,
"step": 660
},
{
"epoch": 0.33702213279678067,
"grad_norm": 0.6846859455108643,
"learning_rate": 0.0001998344145832339,
"loss": 0.0367,
"step": 670
},
{
"epoch": 0.3420523138832998,
"grad_norm": 0.6848050951957703,
"learning_rate": 0.00019981472458384844,
"loss": 0.0536,
"step": 680
},
{
"epoch": 0.3470824949698189,
"grad_norm": 2.0011439323425293,
"learning_rate": 0.00019979392980910637,
"loss": 0.0505,
"step": 690
},
{
"epoch": 0.352112676056338,
"grad_norm": 0.7617112994194031,
"learning_rate": 0.00019977203048916962,
"loss": 0.0518,
"step": 700
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.360905259847641,
"learning_rate": 0.00019974902686642558,
"loss": 0.0545,
"step": 710
},
{
"epoch": 0.36217303822937624,
"grad_norm": 0.8060867190361023,
"learning_rate": 0.00019972491919548438,
"loss": 0.0481,
"step": 720
},
{
"epoch": 0.3672032193158954,
"grad_norm": 0.7293612360954285,
"learning_rate": 0.00019969970774317593,
"loss": 0.0676,
"step": 730
},
{
"epoch": 0.3722334004024145,
"grad_norm": 0.5890267491340637,
"learning_rate": 0.00019967339278854714,
"loss": 0.0571,
"step": 740
},
{
"epoch": 0.3772635814889336,
"grad_norm": 0.7763799428939819,
"learning_rate": 0.00019964597462285888,
"loss": 0.057,
"step": 750
},
{
"epoch": 0.3822937625754527,
"grad_norm": 0.5891818404197693,
"learning_rate": 0.00019961745354958246,
"loss": 0.0473,
"step": 760
},
{
"epoch": 0.3873239436619718,
"grad_norm": 0.41080498695373535,
"learning_rate": 0.00019958782988439654,
"loss": 0.0506,
"step": 770
},
{
"epoch": 0.39235412474849096,
"grad_norm": 0.46914100646972656,
"learning_rate": 0.00019955710395518363,
"loss": 0.053,
"step": 780
},
{
"epoch": 0.39738430583501005,
"grad_norm": 0.8423359990119934,
"learning_rate": 0.00019952527610202624,
"loss": 0.0429,
"step": 790
},
{
"epoch": 0.4024144869215292,
"grad_norm": 0.5299778580665588,
"learning_rate": 0.00019949234667720336,
"loss": 0.0499,
"step": 800
},
{
"epoch": 0.4074446680080483,
"grad_norm": 0.5854944586753845,
"learning_rate": 0.00019945831604518645,
"loss": 0.0473,
"step": 810
},
{
"epoch": 0.4124748490945674,
"grad_norm": 0.6145129203796387,
"learning_rate": 0.0001994231845826354,
"loss": 0.0645,
"step": 820
},
{
"epoch": 0.41750503018108653,
"grad_norm": 0.6725190877914429,
"learning_rate": 0.00019938695267839436,
"loss": 0.0511,
"step": 830
},
{
"epoch": 0.4225352112676056,
"grad_norm": 0.5802650451660156,
"learning_rate": 0.0001993496207334875,
"loss": 0.0388,
"step": 840
},
{
"epoch": 0.4275653923541248,
"grad_norm": 0.6143836975097656,
"learning_rate": 0.00019931118916111448,
"loss": 0.043,
"step": 850
},
{
"epoch": 0.43259557344064387,
"grad_norm": 0.6360625624656677,
"learning_rate": 0.00019927165838664598,
"loss": 0.0541,
"step": 860
},
{
"epoch": 0.43762575452716296,
"grad_norm": 0.49990031123161316,
"learning_rate": 0.00019923102884761892,
"loss": 0.0412,
"step": 870
},
{
"epoch": 0.4426559356136821,
"grad_norm": 0.6556900143623352,
"learning_rate": 0.00019918930099373157,
"loss": 0.055,
"step": 880
},
{
"epoch": 0.4476861167002012,
"grad_norm": 0.42863693833351135,
"learning_rate": 0.00019914647528683865,
"loss": 0.046,
"step": 890
},
{
"epoch": 0.45271629778672035,
"grad_norm": 0.31682249903678894,
"learning_rate": 0.00019910255220094634,
"loss": 0.0477,
"step": 900
},
{
"epoch": 0.45774647887323944,
"grad_norm": 0.5968728065490723,
"learning_rate": 0.0001990575322222067,
"loss": 0.0469,
"step": 910
},
{
"epoch": 0.46277665995975853,
"grad_norm": 0.4193131923675537,
"learning_rate": 0.00019901141584891262,
"loss": 0.0584,
"step": 920
},
{
"epoch": 0.4678068410462777,
"grad_norm": 0.5087130069732666,
"learning_rate": 0.00019896420359149207,
"loss": 0.0595,
"step": 930
},
{
"epoch": 0.47283702213279677,
"grad_norm": 0.3542875349521637,
"learning_rate": 0.00019891589597250265,
"loss": 0.0384,
"step": 940
},
{
"epoch": 0.4778672032193159,
"grad_norm": 0.3090812563896179,
"learning_rate": 0.00019886649352662567,
"loss": 0.0427,
"step": 950
},
{
"epoch": 0.482897384305835,
"grad_norm": 0.6682177782058716,
"learning_rate": 0.00019881599680066024,
"loss": 0.0565,
"step": 960
},
{
"epoch": 0.4879275653923541,
"grad_norm": 0.4167076349258423,
"learning_rate": 0.0001987644063535173,
"loss": 0.0392,
"step": 970
},
{
"epoch": 0.49295774647887325,
"grad_norm": 0.4293626546859741,
"learning_rate": 0.00019871172275621332,
"loss": 0.0409,
"step": 980
},
{
"epoch": 0.49798792756539234,
"grad_norm": 0.6032306551933289,
"learning_rate": 0.00019865794659186406,
"loss": 0.0453,
"step": 990
},
{
"epoch": 0.5030181086519114,
"grad_norm": 0.5772558450698853,
"learning_rate": 0.00019860307845567815,
"loss": 0.0413,
"step": 1000
},
{
"epoch": 0.5080482897384306,
"grad_norm": 0.5984110236167908,
"learning_rate": 0.00019854711895495036,
"loss": 0.0551,
"step": 1010
},
{
"epoch": 0.5130784708249497,
"grad_norm": 0.46663838624954224,
"learning_rate": 0.00019849006870905503,
"loss": 0.0369,
"step": 1020
},
{
"epoch": 0.5181086519114688,
"grad_norm": 0.5539775490760803,
"learning_rate": 0.00019843192834943912,
"loss": 0.0378,
"step": 1030
},
{
"epoch": 0.5231388329979879,
"grad_norm": 0.5635536313056946,
"learning_rate": 0.0001983726985196153,
"loss": 0.0394,
"step": 1040
},
{
"epoch": 0.528169014084507,
"grad_norm": 0.6095893979072571,
"learning_rate": 0.00019831237987515474,
"loss": 0.0354,
"step": 1050
},
{
"epoch": 0.5331991951710262,
"grad_norm": 0.4370132386684418,
"learning_rate": 0.00019825097308367987,
"loss": 0.0477,
"step": 1060
},
{
"epoch": 0.5382293762575453,
"grad_norm": 0.7766854763031006,
"learning_rate": 0.00019818847882485704,
"loss": 0.0368,
"step": 1070
},
{
"epoch": 0.5432595573440644,
"grad_norm": 0.8621564507484436,
"learning_rate": 0.00019812489779038903,
"loss": 0.0493,
"step": 1080
},
{
"epoch": 0.5482897384305835,
"grad_norm": 0.41698649525642395,
"learning_rate": 0.00019806023068400723,
"loss": 0.047,
"step": 1090
},
{
"epoch": 0.5533199195171026,
"grad_norm": 0.334310382604599,
"learning_rate": 0.00019799447822146403,
"loss": 0.0352,
"step": 1100
},
{
"epoch": 0.5583501006036218,
"grad_norm": 0.36509206891059875,
"learning_rate": 0.0001979276411305248,
"loss": 0.0387,
"step": 1110
},
{
"epoch": 0.5633802816901409,
"grad_norm": 0.49226775765419006,
"learning_rate": 0.00019785972015095988,
"loss": 0.0412,
"step": 1120
},
{
"epoch": 0.56841046277666,
"grad_norm": 0.48167258501052856,
"learning_rate": 0.0001977907160345363,
"loss": 0.0454,
"step": 1130
},
{
"epoch": 0.5734406438631791,
"grad_norm": 0.43359094858169556,
"learning_rate": 0.00019772062954500965,
"loss": 0.0355,
"step": 1140
},
{
"epoch": 0.5784708249496981,
"grad_norm": 0.4004541039466858,
"learning_rate": 0.00019764946145811542,
"loss": 0.0401,
"step": 1150
},
{
"epoch": 0.5835010060362174,
"grad_norm": 0.4662073254585266,
"learning_rate": 0.00019757721256156047,
"loss": 0.0382,
"step": 1160
},
{
"epoch": 0.5885311871227364,
"grad_norm": 0.3257390260696411,
"learning_rate": 0.00019750388365501447,
"loss": 0.042,
"step": 1170
},
{
"epoch": 0.5935613682092555,
"grad_norm": 0.5180749297142029,
"learning_rate": 0.0001974294755501008,
"loss": 0.0428,
"step": 1180
},
{
"epoch": 0.5985915492957746,
"grad_norm": 0.4941740334033966,
"learning_rate": 0.00019735398907038779,
"loss": 0.04,
"step": 1190
},
{
"epoch": 0.6036217303822937,
"grad_norm": 0.3722670078277588,
"learning_rate": 0.00019727742505137936,
"loss": 0.0493,
"step": 1200
},
{
"epoch": 0.6086519114688129,
"grad_norm": 0.32677406072616577,
"learning_rate": 0.0001971997843405061,
"loss": 0.0462,
"step": 1210
},
{
"epoch": 0.613682092555332,
"grad_norm": 0.48656272888183594,
"learning_rate": 0.00019712106779711555,
"loss": 0.0451,
"step": 1220
},
{
"epoch": 0.6187122736418511,
"grad_norm": 0.4287594258785248,
"learning_rate": 0.00019704127629246293,
"loss": 0.0294,
"step": 1230
},
{
"epoch": 0.6237424547283702,
"grad_norm": 0.46294787526130676,
"learning_rate": 0.0001969604107097014,
"loss": 0.0462,
"step": 1240
},
{
"epoch": 0.6287726358148893,
"grad_norm": 0.32905468344688416,
"learning_rate": 0.00019687847194387221,
"loss": 0.0324,
"step": 1250
},
{
"epoch": 0.6338028169014085,
"grad_norm": 0.4024835228919983,
"learning_rate": 0.00019679546090189503,
"loss": 0.0407,
"step": 1260
},
{
"epoch": 0.6388329979879276,
"grad_norm": 0.539588451385498,
"learning_rate": 0.00019671137850255766,
"loss": 0.0368,
"step": 1270
},
{
"epoch": 0.6438631790744467,
"grad_norm": 0.32176288962364197,
"learning_rate": 0.00019662622567650595,
"loss": 0.0344,
"step": 1280
},
{
"epoch": 0.6488933601609658,
"grad_norm": 0.45465490221977234,
"learning_rate": 0.0001965400033662336,
"loss": 0.0361,
"step": 1290
},
{
"epoch": 0.6539235412474849,
"grad_norm": 0.47942492365837097,
"learning_rate": 0.00019645271252607155,
"loss": 0.0408,
"step": 1300
},
{
"epoch": 0.6589537223340041,
"grad_norm": 0.2223718762397766,
"learning_rate": 0.00019636435412217758,
"loss": 0.0347,
"step": 1310
},
{
"epoch": 0.6639839034205232,
"grad_norm": 0.550037682056427,
"learning_rate": 0.00019627492913252547,
"loss": 0.0381,
"step": 1320
},
{
"epoch": 0.6690140845070423,
"grad_norm": 0.39280569553375244,
"learning_rate": 0.0001961844385468943,
"loss": 0.0455,
"step": 1330
},
{
"epoch": 0.6740442655935613,
"grad_norm": 0.64002925157547,
"learning_rate": 0.00019609288336685742,
"loss": 0.0495,
"step": 1340
},
{
"epoch": 0.6790744466800804,
"grad_norm": 0.34422725439071655,
"learning_rate": 0.00019600026460577142,
"loss": 0.0309,
"step": 1350
},
{
"epoch": 0.6841046277665996,
"grad_norm": 0.42025962471961975,
"learning_rate": 0.00019590658328876484,
"loss": 0.0479,
"step": 1360
},
{
"epoch": 0.6891348088531187,
"grad_norm": 0.48690280318260193,
"learning_rate": 0.0001958118404527269,
"loss": 0.0352,
"step": 1370
},
{
"epoch": 0.6941649899396378,
"grad_norm": 0.4300714433193207,
"learning_rate": 0.0001957160371462959,
"loss": 0.0367,
"step": 1380
},
{
"epoch": 0.6991951710261569,
"grad_norm": 0.3296925723552704,
"learning_rate": 0.00019561917442984788,
"loss": 0.0399,
"step": 1390
},
{
"epoch": 0.704225352112676,
"grad_norm": 0.5653759837150574,
"learning_rate": 0.00019552125337548462,
"loss": 0.0354,
"step": 1400
},
{
"epoch": 0.7092555331991952,
"grad_norm": 0.39718741178512573,
"learning_rate": 0.00019542227506702173,
"loss": 0.0514,
"step": 1410
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.5332531332969666,
"learning_rate": 0.00019532224059997692,
"loss": 0.0395,
"step": 1420
},
{
"epoch": 0.7193158953722334,
"grad_norm": 0.41001367568969727,
"learning_rate": 0.0001952211510815578,
"loss": 0.0465,
"step": 1430
},
{
"epoch": 0.7243460764587525,
"grad_norm": 0.4375529885292053,
"learning_rate": 0.0001951190076306494,
"loss": 0.0332,
"step": 1440
},
{
"epoch": 0.7293762575452716,
"grad_norm": 0.3857330083847046,
"learning_rate": 0.00019501581137780204,
"loss": 0.0372,
"step": 1450
},
{
"epoch": 0.7344064386317908,
"grad_norm": 0.2521601915359497,
"learning_rate": 0.0001949115634652187,
"loss": 0.0342,
"step": 1460
},
{
"epoch": 0.7394366197183099,
"grad_norm": 0.32540401816368103,
"learning_rate": 0.00019480626504674245,
"loss": 0.0301,
"step": 1470
},
{
"epoch": 0.744466800804829,
"grad_norm": 0.27026352286338806,
"learning_rate": 0.00019469991728784356,
"loss": 0.0354,
"step": 1480
},
{
"epoch": 0.7494969818913481,
"grad_norm": 0.5376250147819519,
"learning_rate": 0.00019459252136560674,
"loss": 0.0429,
"step": 1490
},
{
"epoch": 0.7545271629778671,
"grad_norm": 0.36513984203338623,
"learning_rate": 0.00019448407846871804,
"loss": 0.0396,
"step": 1500
},
{
"epoch": 0.7595573440643864,
"grad_norm": 0.16466562449932098,
"learning_rate": 0.0001943745897974516,
"loss": 0.0252,
"step": 1510
},
{
"epoch": 0.7645875251509054,
"grad_norm": 0.2507416307926178,
"learning_rate": 0.0001942640565636566,
"loss": 0.0338,
"step": 1520
},
{
"epoch": 0.7696177062374245,
"grad_norm": 0.27079200744628906,
"learning_rate": 0.0001941524799907436,
"loss": 0.0331,
"step": 1530
},
{
"epoch": 0.7746478873239436,
"grad_norm": 0.5537086129188538,
"learning_rate": 0.00019403986131367123,
"loss": 0.0312,
"step": 1540
},
{
"epoch": 0.7796780684104627,
"grad_norm": 0.37343883514404297,
"learning_rate": 0.00019392620177893224,
"loss": 0.0447,
"step": 1550
},
{
"epoch": 0.7847082494969819,
"grad_norm": 0.32680392265319824,
"learning_rate": 0.00019381150264454,
"loss": 0.0275,
"step": 1560
},
{
"epoch": 0.789738430583501,
"grad_norm": 0.64634770154953,
"learning_rate": 0.00019369576518001437,
"loss": 0.034,
"step": 1570
},
{
"epoch": 0.7947686116700201,
"grad_norm": 0.2528303861618042,
"learning_rate": 0.00019357899066636773,
"loss": 0.0298,
"step": 1580
},
{
"epoch": 0.7997987927565392,
"grad_norm": 0.4246571660041809,
"learning_rate": 0.00019346118039609086,
"loss": 0.0446,
"step": 1590
},
{
"epoch": 0.8048289738430584,
"grad_norm": 0.3372986316680908,
"learning_rate": 0.0001933423356731384,
"loss": 0.0363,
"step": 1600
},
{
"epoch": 0.8098591549295775,
"grad_norm": 0.49657967686653137,
"learning_rate": 0.00019322245781291475,
"loss": 0.039,
"step": 1610
},
{
"epoch": 0.8148893360160966,
"grad_norm": 0.24667495489120483,
"learning_rate": 0.00019310154814225925,
"loss": 0.0326,
"step": 1620
},
{
"epoch": 0.8199195171026157,
"grad_norm": 0.43383368849754333,
"learning_rate": 0.00019297960799943161,
"loss": 0.0402,
"step": 1630
},
{
"epoch": 0.8249496981891348,
"grad_norm": 0.513130247592926,
"learning_rate": 0.00019285663873409715,
"loss": 0.0357,
"step": 1640
},
{
"epoch": 0.829979879275654,
"grad_norm": 0.4496906101703644,
"learning_rate": 0.00019273264170731157,
"loss": 0.0287,
"step": 1650
},
{
"epoch": 0.8350100603621731,
"grad_norm": 0.467529296875,
"learning_rate": 0.00019260761829150637,
"loss": 0.0437,
"step": 1660
},
{
"epoch": 0.8400402414486922,
"grad_norm": 0.402126282453537,
"learning_rate": 0.0001924815698704732,
"loss": 0.0336,
"step": 1670
},
{
"epoch": 0.8450704225352113,
"grad_norm": 0.41219964623451233,
"learning_rate": 0.00019235449783934881,
"loss": 0.0353,
"step": 1680
},
{
"epoch": 0.8501006036217303,
"grad_norm": 0.6443662047386169,
"learning_rate": 0.00019222640360459954,
"loss": 0.0515,
"step": 1690
},
{
"epoch": 0.8551307847082495,
"grad_norm": 0.3046891391277313,
"learning_rate": 0.0001920972885840057,
"loss": 0.0339,
"step": 1700
},
{
"epoch": 0.8601609657947686,
"grad_norm": 0.5683744549751282,
"learning_rate": 0.00019196715420664596,
"loss": 0.0377,
"step": 1710
},
{
"epoch": 0.8651911468812877,
"grad_norm": 0.46794387698173523,
"learning_rate": 0.0001918360019128815,
"loss": 0.0414,
"step": 1720
},
{
"epoch": 0.8702213279678068,
"grad_norm": 0.4453373849391937,
"learning_rate": 0.00019170383315434002,
"loss": 0.0289,
"step": 1730
},
{
"epoch": 0.8752515090543259,
"grad_norm": 0.44001007080078125,
"learning_rate": 0.00019157064939389978,
"loss": 0.0433,
"step": 1740
},
{
"epoch": 0.8802816901408451,
"grad_norm": 0.28705525398254395,
"learning_rate": 0.00019143645210567328,
"loss": 0.0327,
"step": 1750
},
{
"epoch": 0.8853118712273642,
"grad_norm": 0.49537310004234314,
"learning_rate": 0.00019130124277499109,
"loss": 0.0375,
"step": 1760
},
{
"epoch": 0.8903420523138833,
"grad_norm": 0.5446850657463074,
"learning_rate": 0.00019116502289838523,
"loss": 0.0357,
"step": 1770
},
{
"epoch": 0.8953722334004024,
"grad_norm": 0.49629637598991394,
"learning_rate": 0.0001910277939835728,
"loss": 0.0291,
"step": 1780
},
{
"epoch": 0.9004024144869215,
"grad_norm": 0.3879849910736084,
"learning_rate": 0.00019088955754943912,
"loss": 0.0397,
"step": 1790
},
{
"epoch": 0.9054325955734407,
"grad_norm": 0.4239087402820587,
"learning_rate": 0.00019075031512602104,
"loss": 0.0417,
"step": 1800
},
{
"epoch": 0.9104627766599598,
"grad_norm": 0.5208977460861206,
"learning_rate": 0.00019061006825448997,
"loss": 0.0345,
"step": 1810
},
{
"epoch": 0.9154929577464789,
"grad_norm": 0.28360894322395325,
"learning_rate": 0.0001904688184871348,
"loss": 0.0302,
"step": 1820
},
{
"epoch": 0.920523138832998,
"grad_norm": 0.41871729493141174,
"learning_rate": 0.00019032656738734467,
"loss": 0.0381,
"step": 1830
},
{
"epoch": 0.9255533199195171,
"grad_norm": 0.3086782693862915,
"learning_rate": 0.00019018331652959184,
"loss": 0.0397,
"step": 1840
},
{
"epoch": 0.9305835010060363,
"grad_norm": 0.45297491550445557,
"learning_rate": 0.00019003906749941405,
"loss": 0.0347,
"step": 1850
},
{
"epoch": 0.9356136820925554,
"grad_norm": 0.576064944267273,
"learning_rate": 0.00018989382189339718,
"loss": 0.033,
"step": 1860
},
{
"epoch": 0.9406438631790744,
"grad_norm": 0.4694220721721649,
"learning_rate": 0.00018974758131915732,
"loss": 0.0279,
"step": 1870
},
{
"epoch": 0.9456740442655935,
"grad_norm": 0.3088204264640808,
"learning_rate": 0.00018960034739532336,
"loss": 0.0344,
"step": 1880
},
{
"epoch": 0.9507042253521126,
"grad_norm": 0.3804841637611389,
"learning_rate": 0.00018945212175151856,
"loss": 0.0421,
"step": 1890
},
{
"epoch": 0.9557344064386318,
"grad_norm": 0.4735386073589325,
"learning_rate": 0.00018930290602834298,
"loss": 0.0434,
"step": 1900
},
{
"epoch": 0.9607645875251509,
"grad_norm": 0.6512228846549988,
"learning_rate": 0.0001891527018773551,
"loss": 0.0332,
"step": 1910
},
{
"epoch": 0.96579476861167,
"grad_norm": 0.3166397213935852,
"learning_rate": 0.00018900151096105358,
"loss": 0.0271,
"step": 1920
},
{
"epoch": 0.9708249496981891,
"grad_norm": 0.36923766136169434,
"learning_rate": 0.00018884933495285882,
"loss": 0.0339,
"step": 1930
},
{
"epoch": 0.9758551307847082,
"grad_norm": 0.43903985619544983,
"learning_rate": 0.0001886961755370945,
"loss": 0.0289,
"step": 1940
},
{
"epoch": 0.9808853118712274,
"grad_norm": 0.4627651274204254,
"learning_rate": 0.00018854203440896888,
"loss": 0.0356,
"step": 1950
},
{
"epoch": 0.9859154929577465,
"grad_norm": 0.42779436707496643,
"learning_rate": 0.0001883869132745561,
"loss": 0.0346,
"step": 1960
},
{
"epoch": 0.9909456740442656,
"grad_norm": 0.4050188660621643,
"learning_rate": 0.00018823081385077733,
"loss": 0.0365,
"step": 1970
},
{
"epoch": 0.9959758551307847,
"grad_norm": 0.28512129187583923,
"learning_rate": 0.00018807373786538153,
"loss": 0.0399,
"step": 1980
},
{
"epoch": 1.0010060362173039,
"grad_norm": 0.3914794623851776,
"learning_rate": 0.00018791568705692668,
"loss": 0.034,
"step": 1990
},
{
"epoch": 1.0060362173038229,
"grad_norm": 0.4772788882255554,
"learning_rate": 0.00018775666317476022,
"loss": 0.0308,
"step": 2000
},
{
"epoch": 1.011066398390342,
"grad_norm": 0.2963551878929138,
"learning_rate": 0.0001875966679789999,
"loss": 0.0309,
"step": 2010
},
{
"epoch": 1.0160965794768613,
"grad_norm": 0.17126326262950897,
"learning_rate": 0.0001874357032405142,
"loss": 0.0264,
"step": 2020
},
{
"epoch": 1.0211267605633803,
"grad_norm": 0.3092329800128937,
"learning_rate": 0.00018727377074090272,
"loss": 0.0325,
"step": 2030
},
{
"epoch": 1.0261569416498995,
"grad_norm": 0.23635587096214294,
"learning_rate": 0.00018711087227247657,
"loss": 0.0318,
"step": 2040
},
{
"epoch": 1.0311871227364184,
"grad_norm": 0.4015599489212036,
"learning_rate": 0.00018694700963823837,
"loss": 0.0276,
"step": 2050
},
{
"epoch": 1.0362173038229376,
"grad_norm": 0.3345971405506134,
"learning_rate": 0.00018678218465186243,
"loss": 0.0327,
"step": 2060
},
{
"epoch": 1.0412474849094568,
"grad_norm": 0.5647363066673279,
"learning_rate": 0.0001866163991376746,
"loss": 0.0331,
"step": 2070
},
{
"epoch": 1.0462776659959758,
"grad_norm": 0.3128892481327057,
"learning_rate": 0.0001864496549306321,
"loss": 0.0377,
"step": 2080
},
{
"epoch": 1.051307847082495,
"grad_norm": 0.45699113607406616,
"learning_rate": 0.00018628195387630325,
"loss": 0.0285,
"step": 2090
},
{
"epoch": 1.056338028169014,
"grad_norm": 0.2944326400756836,
"learning_rate": 0.00018611329783084697,
"loss": 0.0337,
"step": 2100
},
{
"epoch": 1.0613682092555332,
"grad_norm": 0.5147525668144226,
"learning_rate": 0.00018594368866099226,
"loss": 0.0403,
"step": 2110
},
{
"epoch": 1.0663983903420524,
"grad_norm": 0.3949408233165741,
"learning_rate": 0.0001857731282440176,
"loss": 0.0398,
"step": 2120
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.6203738451004028,
"learning_rate": 0.00018560161846773002,
"loss": 0.0374,
"step": 2130
},
{
"epoch": 1.0764587525150906,
"grad_norm": 0.5248531699180603,
"learning_rate": 0.00018542916123044444,
"loss": 0.035,
"step": 2140
},
{
"epoch": 1.0814889336016096,
"grad_norm": 0.25072234869003296,
"learning_rate": 0.00018525575844096243,
"loss": 0.0315,
"step": 2150
},
{
"epoch": 1.0865191146881288,
"grad_norm": 0.46970999240875244,
"learning_rate": 0.00018508141201855125,
"loss": 0.0473,
"step": 2160
},
{
"epoch": 1.091549295774648,
"grad_norm": 0.4322168231010437,
"learning_rate": 0.00018490612389292243,
"loss": 0.0322,
"step": 2170
},
{
"epoch": 1.096579476861167,
"grad_norm": 0.31416237354278564,
"learning_rate": 0.0001847298960042106,
"loss": 0.026,
"step": 2180
},
{
"epoch": 1.1016096579476862,
"grad_norm": 0.4525179862976074,
"learning_rate": 0.0001845527303029519,
"loss": 0.0341,
"step": 2190
},
{
"epoch": 1.1066398390342052,
"grad_norm": 0.3020547926425934,
"learning_rate": 0.00018437462875006247,
"loss": 0.0362,
"step": 2200
},
{
"epoch": 1.1116700201207244,
"grad_norm": 0.24597640335559845,
"learning_rate": 0.0001841955933168166,
"loss": 0.0319,
"step": 2210
},
{
"epoch": 1.1167002012072436,
"grad_norm": 0.5903005003929138,
"learning_rate": 0.00018401562598482517,
"loss": 0.0302,
"step": 2220
},
{
"epoch": 1.1217303822937625,
"grad_norm": 0.3023013174533844,
"learning_rate": 0.00018383472874601334,
"loss": 0.0372,
"step": 2230
},
{
"epoch": 1.1267605633802817,
"grad_norm": 0.39982765913009644,
"learning_rate": 0.00018365290360259894,
"loss": 0.0285,
"step": 2240
},
{
"epoch": 1.1317907444668007,
"grad_norm": 0.32488811016082764,
"learning_rate": 0.00018347015256706998,
"loss": 0.0301,
"step": 2250
},
{
"epoch": 1.13682092555332,
"grad_norm": 0.3721481263637543,
"learning_rate": 0.00018328647766216246,
"loss": 0.0466,
"step": 2260
},
{
"epoch": 1.1418511066398391,
"grad_norm": 0.37539154291152954,
"learning_rate": 0.00018310188092083803,
"loss": 0.0305,
"step": 2270
},
{
"epoch": 1.1468812877263581,
"grad_norm": 0.4233405590057373,
"learning_rate": 0.00018291636438626152,
"loss": 0.0383,
"step": 2280
},
{
"epoch": 1.1519114688128773,
"grad_norm": 0.3756147623062134,
"learning_rate": 0.00018272993011177822,
"loss": 0.0339,
"step": 2290
},
{
"epoch": 1.1569416498993963,
"grad_norm": 0.2825983762741089,
"learning_rate": 0.00018254258016089123,
"loss": 0.0323,
"step": 2300
},
{
"epoch": 1.1619718309859155,
"grad_norm": 0.3585676848888397,
"learning_rate": 0.00018235431660723855,
"loss": 0.0257,
"step": 2310
},
{
"epoch": 1.1670020120724347,
"grad_norm": 0.34615543484687805,
"learning_rate": 0.00018216514153457025,
"loss": 0.031,
"step": 2320
},
{
"epoch": 1.1720321931589537,
"grad_norm": 0.3647988438606262,
"learning_rate": 0.00018197505703672522,
"loss": 0.0298,
"step": 2330
},
{
"epoch": 1.1770623742454729,
"grad_norm": 0.2951864004135132,
"learning_rate": 0.0001817840652176082,
"loss": 0.0295,
"step": 2340
},
{
"epoch": 1.1820925553319919,
"grad_norm": 0.24966521561145782,
"learning_rate": 0.00018159216819116635,
"loss": 0.0331,
"step": 2350
},
{
"epoch": 1.187122736418511,
"grad_norm": 0.31395167112350464,
"learning_rate": 0.00018139936808136593,
"loss": 0.0299,
"step": 2360
},
{
"epoch": 1.1921529175050303,
"grad_norm": 0.2532835006713867,
"learning_rate": 0.00018120566702216877,
"loss": 0.0327,
"step": 2370
},
{
"epoch": 1.1971830985915493,
"grad_norm": 0.30898261070251465,
"learning_rate": 0.00018101106715750855,
"loss": 0.026,
"step": 2380
},
{
"epoch": 1.2022132796780685,
"grad_norm": 0.34393829107284546,
"learning_rate": 0.0001808155706412673,
"loss": 0.0413,
"step": 2390
},
{
"epoch": 1.2072434607645874,
"grad_norm": 0.32124215364456177,
"learning_rate": 0.00018061917963725134,
"loss": 0.0282,
"step": 2400
},
{
"epoch": 1.2122736418511066,
"grad_norm": 0.4290277063846588,
"learning_rate": 0.0001804218963191674,
"loss": 0.0316,
"step": 2410
},
{
"epoch": 1.2173038229376258,
"grad_norm": 0.43430641293525696,
"learning_rate": 0.00018022372287059866,
"loss": 0.031,
"step": 2420
},
{
"epoch": 1.2223340040241448,
"grad_norm": 0.2853562831878662,
"learning_rate": 0.00018002466148498035,
"loss": 0.028,
"step": 2430
},
{
"epoch": 1.227364185110664,
"grad_norm": 0.40154245495796204,
"learning_rate": 0.00017982471436557579,
"loss": 0.0309,
"step": 2440
},
{
"epoch": 1.232394366197183,
"grad_norm": 0.45144811272621155,
"learning_rate": 0.00017962388372545177,
"loss": 0.0313,
"step": 2450
},
{
"epoch": 1.2374245472837022,
"grad_norm": 0.310092031955719,
"learning_rate": 0.00017942217178745396,
"loss": 0.0257,
"step": 2460
},
{
"epoch": 1.2424547283702214,
"grad_norm": 0.31381756067276,
"learning_rate": 0.00017921958078418278,
"loss": 0.0289,
"step": 2470
},
{
"epoch": 1.2474849094567404,
"grad_norm": 0.2533479630947113,
"learning_rate": 0.00017901611295796806,
"loss": 0.0307,
"step": 2480
},
{
"epoch": 1.2525150905432596,
"grad_norm": 0.2752733826637268,
"learning_rate": 0.0001788117705608447,
"loss": 0.0358,
"step": 2490
},
{
"epoch": 1.2575452716297786,
"grad_norm": 0.40528246760368347,
"learning_rate": 0.00017860655585452763,
"loss": 0.0358,
"step": 2500
},
{
"epoch": 1.2625754527162978,
"grad_norm": 0.37113526463508606,
"learning_rate": 0.00017840047111038651,
"loss": 0.0319,
"step": 2510
},
{
"epoch": 1.267605633802817,
"grad_norm": 0.37338030338287354,
"learning_rate": 0.00017819351860942105,
"loss": 0.0292,
"step": 2520
},
{
"epoch": 1.272635814889336,
"grad_norm": 0.4197677671909332,
"learning_rate": 0.00017798570064223533,
"loss": 0.0335,
"step": 2530
},
{
"epoch": 1.2776659959758552,
"grad_norm": 0.3801611363887787,
"learning_rate": 0.0001777770195090128,
"loss": 0.0364,
"step": 2540
},
{
"epoch": 1.2826961770623742,
"grad_norm": 0.49140316247940063,
"learning_rate": 0.00017756747751949045,
"loss": 0.0244,
"step": 2550
},
{
"epoch": 1.2877263581488934,
"grad_norm": 0.37291839718818665,
"learning_rate": 0.00017735707699293368,
"loss": 0.0375,
"step": 2560
},
{
"epoch": 1.2927565392354126,
"grad_norm": 0.27540504932403564,
"learning_rate": 0.0001771458202581102,
"loss": 0.036,
"step": 2570
},
{
"epoch": 1.2977867203219315,
"grad_norm": 0.3908439576625824,
"learning_rate": 0.00017693370965326467,
"loss": 0.0343,
"step": 2580
},
{
"epoch": 1.3028169014084507,
"grad_norm": 0.46150150895118713,
"learning_rate": 0.00017672074752609235,
"loss": 0.0277,
"step": 2590
},
{
"epoch": 1.3078470824949697,
"grad_norm": 0.3451060950756073,
"learning_rate": 0.0001765069362337136,
"loss": 0.0366,
"step": 2600
},
{
"epoch": 1.312877263581489,
"grad_norm": 0.5258057117462158,
"learning_rate": 0.00017629227814264738,
"loss": 0.0305,
"step": 2610
},
{
"epoch": 1.3179074446680081,
"grad_norm": 0.3369615972042084,
"learning_rate": 0.00017607677562878543,
"loss": 0.0303,
"step": 2620
},
{
"epoch": 1.3229376257545271,
"grad_norm": 0.48052117228507996,
"learning_rate": 0.00017586043107736558,
"loss": 0.0353,
"step": 2630
},
{
"epoch": 1.3279678068410463,
"grad_norm": 0.44058990478515625,
"learning_rate": 0.0001756432468829457,
"loss": 0.0357,
"step": 2640
},
{
"epoch": 1.3329979879275653,
"grad_norm": 0.3025529980659485,
"learning_rate": 0.000175425225449377,
"loss": 0.03,
"step": 2650
},
{
"epoch": 1.3380281690140845,
"grad_norm": 0.3708013892173767,
"learning_rate": 0.00017520636918977743,
"loss": 0.0263,
"step": 2660
},
{
"epoch": 1.3430583501006037,
"grad_norm": 0.41252729296684265,
"learning_rate": 0.00017498668052650502,
"loss": 0.0376,
"step": 2670
},
{
"epoch": 1.3480885311871227,
"grad_norm": 0.4583453834056854,
"learning_rate": 0.0001747661618911312,
"loss": 0.035,
"step": 2680
},
{
"epoch": 1.3531187122736419,
"grad_norm": 0.26764222979545593,
"learning_rate": 0.00017454481572441353,
"loss": 0.0258,
"step": 2690
},
{
"epoch": 1.3581488933601609,
"grad_norm": 0.36722269654273987,
"learning_rate": 0.000174322644476269,
"loss": 0.025,
"step": 2700
},
{
"epoch": 1.36317907444668,
"grad_norm": 0.2594809830188751,
"learning_rate": 0.0001740996506057469,
"loss": 0.0271,
"step": 2710
},
{
"epoch": 1.3682092555331993,
"grad_norm": 0.36106789112091064,
"learning_rate": 0.00017387583658100142,
"loss": 0.0293,
"step": 2720
},
{
"epoch": 1.3732394366197183,
"grad_norm": 0.3363661468029022,
"learning_rate": 0.00017365120487926441,
"loss": 0.0249,
"step": 2730
},
{
"epoch": 1.3782696177062375,
"grad_norm": 0.275418758392334,
"learning_rate": 0.000173425757986818,
"loss": 0.028,
"step": 2740
},
{
"epoch": 1.3832997987927564,
"grad_norm": 0.26659509539604187,
"learning_rate": 0.00017319949839896722,
"loss": 0.023,
"step": 2750
},
{
"epoch": 1.3883299798792756,
"grad_norm": 0.461508184671402,
"learning_rate": 0.00017297242862001198,
"loss": 0.0302,
"step": 2760
},
{
"epoch": 1.3933601609657948,
"grad_norm": 0.30697330832481384,
"learning_rate": 0.0001727445511632197,
"loss": 0.0252,
"step": 2770
},
{
"epoch": 1.3983903420523138,
"grad_norm": 0.35531529784202576,
"learning_rate": 0.00017251586855079746,
"loss": 0.0251,
"step": 2780
},
{
"epoch": 1.403420523138833,
"grad_norm": 0.4393905997276306,
"learning_rate": 0.0001722863833138639,
"loss": 0.0339,
"step": 2790
},
{
"epoch": 1.408450704225352,
"grad_norm": 0.59965980052948,
"learning_rate": 0.00017205609799242138,
"loss": 0.0344,
"step": 2800
},
{
"epoch": 1.4134808853118712,
"grad_norm": 0.2698121964931488,
"learning_rate": 0.0001718250151353278,
"loss": 0.0309,
"step": 2810
},
{
"epoch": 1.4185110663983904,
"grad_norm": 0.5065796375274658,
"learning_rate": 0.00017159313730026837,
"loss": 0.0271,
"step": 2820
},
{
"epoch": 1.4235412474849094,
"grad_norm": 0.39271390438079834,
"learning_rate": 0.0001713604670537273,
"loss": 0.0362,
"step": 2830
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2914319932460785,
"learning_rate": 0.00017112700697095954,
"loss": 0.036,
"step": 2840
},
{
"epoch": 1.4336016096579476,
"grad_norm": 0.33919307589530945,
"learning_rate": 0.00017089275963596195,
"loss": 0.0303,
"step": 2850
},
{
"epoch": 1.4386317907444668,
"grad_norm": 0.38898414373397827,
"learning_rate": 0.0001706577276414451,
"loss": 0.0336,
"step": 2860
},
{
"epoch": 1.443661971830986,
"grad_norm": 0.3567386567592621,
"learning_rate": 0.00017042191358880424,
"loss": 0.0315,
"step": 2870
},
{
"epoch": 1.448692152917505,
"grad_norm": 0.5339713096618652,
"learning_rate": 0.00017018532008809074,
"loss": 0.0299,
"step": 2880
},
{
"epoch": 1.4537223340040242,
"grad_norm": 0.4281061887741089,
"learning_rate": 0.00016994794975798305,
"loss": 0.0327,
"step": 2890
},
{
"epoch": 1.4587525150905432,
"grad_norm": 0.3700096905231476,
"learning_rate": 0.00016970980522575775,
"loss": 0.0299,
"step": 2900
},
{
"epoch": 1.4637826961770624,
"grad_norm": 0.41299402713775635,
"learning_rate": 0.00016947088912726052,
"loss": 0.0347,
"step": 2910
},
{
"epoch": 1.4688128772635816,
"grad_norm": 0.2901936173439026,
"learning_rate": 0.00016923120410687695,
"loss": 0.0297,
"step": 2920
},
{
"epoch": 1.4738430583501005,
"grad_norm": 0.4472912549972534,
"learning_rate": 0.00016899075281750326,
"loss": 0.0274,
"step": 2930
},
{
"epoch": 1.4788732394366197,
"grad_norm": 0.3437786102294922,
"learning_rate": 0.00016874953792051693,
"loss": 0.0353,
"step": 2940
},
{
"epoch": 1.4839034205231387,
"grad_norm": 0.31530702114105225,
"learning_rate": 0.00016850756208574717,
"loss": 0.0369,
"step": 2950
},
{
"epoch": 1.488933601609658,
"grad_norm": 0.37885403633117676,
"learning_rate": 0.00016826482799144556,
"loss": 0.0326,
"step": 2960
},
{
"epoch": 1.4939637826961771,
"grad_norm": 0.3617626428604126,
"learning_rate": 0.00016802133832425625,
"loss": 0.0294,
"step": 2970
},
{
"epoch": 1.4989939637826961,
"grad_norm": 0.34277963638305664,
"learning_rate": 0.0001677770957791862,
"loss": 0.0269,
"step": 2980
},
{
"epoch": 1.5040241448692153,
"grad_norm": 0.46021127700805664,
"learning_rate": 0.00016753210305957557,
"loss": 0.0307,
"step": 2990
},
{
"epoch": 1.5090543259557343,
"grad_norm": 0.5008649230003357,
"learning_rate": 0.0001672863628770675,
"loss": 0.0434,
"step": 3000
},
{
"epoch": 1.5140845070422535,
"grad_norm": 0.2918837368488312,
"learning_rate": 0.0001670398779515784,
"loss": 0.0297,
"step": 3010
},
{
"epoch": 1.5191146881287727,
"grad_norm": 0.30590662360191345,
"learning_rate": 0.00016679265101126743,
"loss": 0.03,
"step": 3020
},
{
"epoch": 1.524144869215292,
"grad_norm": 0.29896557331085205,
"learning_rate": 0.00016654468479250688,
"loss": 0.0332,
"step": 3030
},
{
"epoch": 1.529175050301811,
"grad_norm": 0.5691266059875488,
"learning_rate": 0.00016629598203985135,
"loss": 0.0312,
"step": 3040
},
{
"epoch": 1.5342052313883299,
"grad_norm": 0.5821362733840942,
"learning_rate": 0.00016604654550600762,
"loss": 0.0329,
"step": 3050
},
{
"epoch": 1.539235412474849,
"grad_norm": 0.3751659691333771,
"learning_rate": 0.00016579637795180425,
"loss": 0.029,
"step": 3060
},
{
"epoch": 1.5442655935613683,
"grad_norm": 0.30066096782684326,
"learning_rate": 0.0001655454821461608,
"loss": 0.0207,
"step": 3070
},
{
"epoch": 1.5492957746478875,
"grad_norm": 0.5700194239616394,
"learning_rate": 0.00016529386086605737,
"loss": 0.0464,
"step": 3080
},
{
"epoch": 1.5543259557344065,
"grad_norm": 0.3803764581680298,
"learning_rate": 0.00016504151689650386,
"loss": 0.0295,
"step": 3090
},
{
"epoch": 1.5593561368209254,
"grad_norm": 0.28992944955825806,
"learning_rate": 0.0001647884530305089,
"loss": 0.0339,
"step": 3100
},
{
"epoch": 1.5643863179074446,
"grad_norm": 0.1968836933374405,
"learning_rate": 0.00016453467206904935,
"loss": 0.027,
"step": 3110
},
{
"epoch": 1.5694164989939638,
"grad_norm": 0.4015430510044098,
"learning_rate": 0.00016428017682103892,
"loss": 0.0308,
"step": 3120
},
{
"epoch": 1.574446680080483,
"grad_norm": 0.2642582356929779,
"learning_rate": 0.00016402497010329725,
"loss": 0.0293,
"step": 3130
},
{
"epoch": 1.579476861167002,
"grad_norm": 0.446898877620697,
"learning_rate": 0.00016376905474051873,
"loss": 0.0274,
"step": 3140
},
{
"epoch": 1.584507042253521,
"grad_norm": 0.4257184565067291,
"learning_rate": 0.00016351243356524123,
"loss": 0.032,
"step": 3150
},
{
"epoch": 1.5895372233400402,
"grad_norm": 0.3191612660884857,
"learning_rate": 0.00016325510941781474,
"loss": 0.0286,
"step": 3160
},
{
"epoch": 1.5945674044265594,
"grad_norm": 0.2913196086883545,
"learning_rate": 0.00016299708514636992,
"loss": 0.0331,
"step": 3170
},
{
"epoch": 1.5995975855130786,
"grad_norm": 0.35634562373161316,
"learning_rate": 0.00016273836360678652,
"loss": 0.0289,
"step": 3180
},
{
"epoch": 1.6046277665995976,
"grad_norm": 0.4977535009384155,
"learning_rate": 0.00016247894766266196,
"loss": 0.0308,
"step": 3190
},
{
"epoch": 1.6096579476861166,
"grad_norm": 0.32202795147895813,
"learning_rate": 0.0001622188401852794,
"loss": 0.0297,
"step": 3200
},
{
"epoch": 1.6146881287726358,
"grad_norm": 0.18511469662189484,
"learning_rate": 0.00016195804405357613,
"loss": 0.0243,
"step": 3210
},
{
"epoch": 1.619718309859155,
"grad_norm": 0.2580728530883789,
"learning_rate": 0.00016169656215411164,
"loss": 0.0247,
"step": 3220
},
{
"epoch": 1.6247484909456742,
"grad_norm": 0.46906596422195435,
"learning_rate": 0.00016143439738103564,
"loss": 0.0245,
"step": 3230
},
{
"epoch": 1.6297786720321932,
"grad_norm": 0.39322683215141296,
"learning_rate": 0.00016117155263605608,
"loss": 0.0282,
"step": 3240
},
{
"epoch": 1.6348088531187122,
"grad_norm": 0.2526596188545227,
"learning_rate": 0.00016090803082840707,
"loss": 0.025,
"step": 3250
},
{
"epoch": 1.6398390342052314,
"grad_norm": 0.3361895978450775,
"learning_rate": 0.00016064383487481655,
"loss": 0.0208,
"step": 3260
},
{
"epoch": 1.6448692152917506,
"grad_norm": 0.30806657671928406,
"learning_rate": 0.0001603789676994741,
"loss": 0.0289,
"step": 3270
},
{
"epoch": 1.6498993963782698,
"grad_norm": 0.47636348009109497,
"learning_rate": 0.00016011343223399865,
"loss": 0.0232,
"step": 3280
},
{
"epoch": 1.6549295774647887,
"grad_norm": 0.3957202732563019,
"learning_rate": 0.00015984723141740576,
"loss": 0.0338,
"step": 3290
},
{
"epoch": 1.6599597585513077,
"grad_norm": 0.3279256820678711,
"learning_rate": 0.0001595803681960754,
"loss": 0.032,
"step": 3300
},
{
"epoch": 1.664989939637827,
"grad_norm": 0.1826469749212265,
"learning_rate": 0.00015931284552371918,
"loss": 0.0202,
"step": 3310
},
{
"epoch": 1.6700201207243461,
"grad_norm": 0.37201932072639465,
"learning_rate": 0.00015904466636134772,
"loss": 0.0272,
"step": 3320
},
{
"epoch": 1.6750503018108653,
"grad_norm": 0.3043290078639984,
"learning_rate": 0.00015877583367723773,
"loss": 0.0313,
"step": 3330
},
{
"epoch": 1.6800804828973843,
"grad_norm": 0.31204384565353394,
"learning_rate": 0.00015850635044689938,
"loss": 0.0275,
"step": 3340
},
{
"epoch": 1.6851106639839033,
"grad_norm": 0.22221866250038147,
"learning_rate": 0.00015823621965304325,
"loss": 0.0254,
"step": 3350
},
{
"epoch": 1.6901408450704225,
"grad_norm": 0.22048987448215485,
"learning_rate": 0.00015796544428554725,
"loss": 0.026,
"step": 3360
},
{
"epoch": 1.6951710261569417,
"grad_norm": 0.34224534034729004,
"learning_rate": 0.00015769402734142367,
"loss": 0.0329,
"step": 3370
},
{
"epoch": 1.700201207243461,
"grad_norm": 0.24496717751026154,
"learning_rate": 0.00015742197182478586,
"loss": 0.0233,
"step": 3380
},
{
"epoch": 1.70523138832998,
"grad_norm": 0.3360805809497833,
"learning_rate": 0.00015714928074681515,
"loss": 0.0272,
"step": 3390
},
{
"epoch": 1.7102615694164989,
"grad_norm": 0.21322770416736603,
"learning_rate": 0.00015687595712572737,
"loss": 0.0245,
"step": 3400
},
{
"epoch": 1.715291750503018,
"grad_norm": 0.31249675154685974,
"learning_rate": 0.00015660200398673948,
"loss": 0.0254,
"step": 3410
},
{
"epoch": 1.7203219315895373,
"grad_norm": 0.37619125843048096,
"learning_rate": 0.00015632742436203615,
"loss": 0.0292,
"step": 3420
},
{
"epoch": 1.7253521126760565,
"grad_norm": 0.29316648840904236,
"learning_rate": 0.00015605222129073617,
"loss": 0.0264,
"step": 3430
},
{
"epoch": 1.7303822937625755,
"grad_norm": 0.31073886156082153,
"learning_rate": 0.0001557763978188588,
"loss": 0.0322,
"step": 3440
},
{
"epoch": 1.7354124748490944,
"grad_norm": 0.3250234127044678,
"learning_rate": 0.00015549995699929,
"loss": 0.0338,
"step": 3450
},
{
"epoch": 1.7404426559356136,
"grad_norm": 0.3571024537086487,
"learning_rate": 0.00015522290189174867,
"loss": 0.028,
"step": 3460
},
{
"epoch": 1.7454728370221329,
"grad_norm": 0.3457786440849304,
"learning_rate": 0.000154945235562753,
"loss": 0.0289,
"step": 3470
},
{
"epoch": 1.750503018108652,
"grad_norm": 0.3116254508495331,
"learning_rate": 0.00015466696108558611,
"loss": 0.024,
"step": 3480
},
{
"epoch": 1.755533199195171,
"grad_norm": 0.43166661262512207,
"learning_rate": 0.00015438808154026238,
"loss": 0.0305,
"step": 3490
},
{
"epoch": 1.76056338028169,
"grad_norm": 0.3314637541770935,
"learning_rate": 0.0001541086000134932,
"loss": 0.023,
"step": 3500
},
{
"epoch": 1.7655935613682092,
"grad_norm": 0.407803475856781,
"learning_rate": 0.0001538285195986529,
"loss": 0.029,
"step": 3510
},
{
"epoch": 1.7706237424547284,
"grad_norm": 0.28635403513908386,
"learning_rate": 0.0001535478433957444,
"loss": 0.0302,
"step": 3520
},
{
"epoch": 1.7756539235412476,
"grad_norm": 0.43160927295684814,
"learning_rate": 0.000153266574511365,
"loss": 0.0271,
"step": 3530
},
{
"epoch": 1.7806841046277666,
"grad_norm": 0.4747959077358246,
"learning_rate": 0.0001529847160586719,
"loss": 0.034,
"step": 3540
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.2701944410800934,
"learning_rate": 0.0001527022711573479,
"loss": 0.028,
"step": 3550
},
{
"epoch": 1.7907444668008048,
"grad_norm": 0.20618535578250885,
"learning_rate": 0.00015241924293356665,
"loss": 0.0311,
"step": 3560
},
{
"epoch": 1.795774647887324,
"grad_norm": 0.3267618417739868,
"learning_rate": 0.0001521356345199582,
"loss": 0.0358,
"step": 3570
},
{
"epoch": 1.8008048289738432,
"grad_norm": 0.24747027456760406,
"learning_rate": 0.0001518514490555743,
"loss": 0.0296,
"step": 3580
},
{
"epoch": 1.8058350100603622,
"grad_norm": 0.23700428009033203,
"learning_rate": 0.00015156668968585368,
"loss": 0.0313,
"step": 3590
},
{
"epoch": 1.8108651911468812,
"grad_norm": 0.29978063702583313,
"learning_rate": 0.0001512813595625872,
"loss": 0.0321,
"step": 3600
},
{
"epoch": 1.8158953722334004,
"grad_norm": 0.33228403329849243,
"learning_rate": 0.00015099546184388287,
"loss": 0.0305,
"step": 3610
},
{
"epoch": 1.8209255533199196,
"grad_norm": 0.3012440502643585,
"learning_rate": 0.00015070899969413115,
"loss": 0.0264,
"step": 3620
},
{
"epoch": 1.8259557344064388,
"grad_norm": 0.31137219071388245,
"learning_rate": 0.00015042197628396966,
"loss": 0.0276,
"step": 3630
},
{
"epoch": 1.8309859154929577,
"grad_norm": 0.2960556745529175,
"learning_rate": 0.0001501343947902482,
"loss": 0.0244,
"step": 3640
},
{
"epoch": 1.8360160965794767,
"grad_norm": 0.41761451959609985,
"learning_rate": 0.0001498462583959936,
"loss": 0.0271,
"step": 3650
},
{
"epoch": 1.841046277665996,
"grad_norm": 0.2327389270067215,
"learning_rate": 0.00014955757029037455,
"loss": 0.0209,
"step": 3660
},
{
"epoch": 1.8460764587525151,
"grad_norm": 0.4204856753349304,
"learning_rate": 0.0001492683336686661,
"loss": 0.0301,
"step": 3670
},
{
"epoch": 1.8511066398390343,
"grad_norm": 0.31292521953582764,
"learning_rate": 0.00014897855173221452,
"loss": 0.0248,
"step": 3680
},
{
"epoch": 1.8561368209255533,
"grad_norm": 0.304541677236557,
"learning_rate": 0.00014868822768840162,
"loss": 0.0237,
"step": 3690
},
{
"epoch": 1.8611670020120723,
"grad_norm": 0.41588783264160156,
"learning_rate": 0.00014839736475060966,
"loss": 0.0268,
"step": 3700
},
{
"epoch": 1.8661971830985915,
"grad_norm": 0.376002699136734,
"learning_rate": 0.0001481059661381852,
"loss": 0.0287,
"step": 3710
},
{
"epoch": 1.8712273641851107,
"grad_norm": 0.3036446273326874,
"learning_rate": 0.00014781403507640404,
"loss": 0.0313,
"step": 3720
},
{
"epoch": 1.87625754527163,
"grad_norm": 0.15947189927101135,
"learning_rate": 0.00014752157479643515,
"loss": 0.0269,
"step": 3730
},
{
"epoch": 1.881287726358149,
"grad_norm": 0.3008073568344116,
"learning_rate": 0.00014722858853530502,
"loss": 0.031,
"step": 3740
},
{
"epoch": 1.8863179074446679,
"grad_norm": 0.34803229570388794,
"learning_rate": 0.00014693507953586192,
"loss": 0.0289,
"step": 3750
},
{
"epoch": 1.891348088531187,
"grad_norm": 0.47900477051734924,
"learning_rate": 0.0001466410510467398,
"loss": 0.0239,
"step": 3760
},
{
"epoch": 1.8963782696177063,
"grad_norm": 0.3730434775352478,
"learning_rate": 0.00014634650632232255,
"loss": 0.0247,
"step": 3770
},
{
"epoch": 1.9014084507042255,
"grad_norm": 0.2846371829509735,
"learning_rate": 0.00014605144862270782,
"loss": 0.0274,
"step": 3780
},
{
"epoch": 1.9064386317907445,
"grad_norm": 0.27134642004966736,
"learning_rate": 0.00014575588121367111,
"loss": 0.0295,
"step": 3790
},
{
"epoch": 1.9114688128772634,
"grad_norm": 0.2962862253189087,
"learning_rate": 0.00014545980736662943,
"loss": 0.029,
"step": 3800
},
{
"epoch": 1.9164989939637826,
"grad_norm": 0.4870172441005707,
"learning_rate": 0.00014516323035860515,
"loss": 0.0334,
"step": 3810
},
{
"epoch": 1.9215291750503019,
"grad_norm": 0.4422418475151062,
"learning_rate": 0.00014486615347218985,
"loss": 0.0236,
"step": 3820
},
{
"epoch": 1.926559356136821,
"grad_norm": 0.15602879226207733,
"learning_rate": 0.00014456857999550787,
"loss": 0.025,
"step": 3830
},
{
"epoch": 1.93158953722334,
"grad_norm": 0.3513808846473694,
"learning_rate": 0.00014427051322217987,
"loss": 0.0294,
"step": 3840
},
{
"epoch": 1.936619718309859,
"grad_norm": 0.40983107686042786,
"learning_rate": 0.00014397195645128645,
"loss": 0.0249,
"step": 3850
},
{
"epoch": 1.9416498993963782,
"grad_norm": 0.22102007269859314,
"learning_rate": 0.00014367291298733178,
"loss": 0.0296,
"step": 3860
},
{
"epoch": 1.9466800804828974,
"grad_norm": 0.2670123279094696,
"learning_rate": 0.00014337338614020672,
"loss": 0.028,
"step": 3870
},
{
"epoch": 1.9517102615694166,
"grad_norm": 0.3241349160671234,
"learning_rate": 0.00014307337922515238,
"loss": 0.0295,
"step": 3880
},
{
"epoch": 1.9567404426559356,
"grad_norm": 0.2833192050457001,
"learning_rate": 0.00014277289556272342,
"loss": 0.0354,
"step": 3890
},
{
"epoch": 1.9617706237424546,
"grad_norm": 0.31631726026535034,
"learning_rate": 0.0001424719384787512,
"loss": 0.0279,
"step": 3900
},
{
"epoch": 1.9668008048289738,
"grad_norm": 0.38899293541908264,
"learning_rate": 0.0001421705113043071,
"loss": 0.0223,
"step": 3910
},
{
"epoch": 1.971830985915493,
"grad_norm": 0.33592867851257324,
"learning_rate": 0.00014186861737566558,
"loss": 0.0256,
"step": 3920
},
{
"epoch": 1.9768611670020122,
"grad_norm": 0.29194048047065735,
"learning_rate": 0.00014156626003426724,
"loss": 0.0285,
"step": 3930
},
{
"epoch": 1.9818913480885312,
"grad_norm": 0.5011776089668274,
"learning_rate": 0.00014126344262668176,
"loss": 0.0367,
"step": 3940
},
{
"epoch": 1.9869215291750502,
"grad_norm": 0.29997947812080383,
"learning_rate": 0.00014096016850457117,
"loss": 0.0283,
"step": 3950
},
{
"epoch": 1.9919517102615694,
"grad_norm": 0.26518872380256653,
"learning_rate": 0.0001406564410246523,
"loss": 0.0367,
"step": 3960
},
{
"epoch": 1.9969818913480886,
"grad_norm": 0.23170867562294006,
"learning_rate": 0.00014035226354866,
"loss": 0.0278,
"step": 3970
},
{
"epoch": 2.0020120724346078,
"grad_norm": 0.25075894594192505,
"learning_rate": 0.0001400476394433098,
"loss": 0.0248,
"step": 3980
},
{
"epoch": 2.007042253521127,
"grad_norm": 0.31776684522628784,
"learning_rate": 0.00013974257208026054,
"loss": 0.0239,
"step": 3990
},
{
"epoch": 2.0120724346076457,
"grad_norm": 0.33745163679122925,
"learning_rate": 0.00013943706483607725,
"loss": 0.0221,
"step": 4000
},
{
"epoch": 2.017102615694165,
"grad_norm": 0.21726344525814056,
"learning_rate": 0.00013913112109219364,
"loss": 0.0262,
"step": 4010
},
{
"epoch": 2.022132796780684,
"grad_norm": 0.3767722249031067,
"learning_rate": 0.0001388247442348747,
"loss": 0.023,
"step": 4020
},
{
"epoch": 2.0271629778672033,
"grad_norm": 0.4051326513290405,
"learning_rate": 0.00013851793765517925,
"loss": 0.0248,
"step": 4030
},
{
"epoch": 2.0321931589537225,
"grad_norm": 0.2341754138469696,
"learning_rate": 0.00013821070474892238,
"loss": 0.0229,
"step": 4040
},
{
"epoch": 2.0372233400402413,
"grad_norm": 0.30645281076431274,
"learning_rate": 0.00013790304891663792,
"loss": 0.0336,
"step": 4050
},
{
"epoch": 2.0422535211267605,
"grad_norm": 0.37985026836395264,
"learning_rate": 0.00013759497356354068,
"loss": 0.025,
"step": 4060
},
{
"epoch": 2.0472837022132797,
"grad_norm": 0.35980600118637085,
"learning_rate": 0.00013728648209948886,
"loss": 0.0344,
"step": 4070
},
{
"epoch": 2.052313883299799,
"grad_norm": 0.23638148605823517,
"learning_rate": 0.0001369775779389463,
"loss": 0.029,
"step": 4080
},
{
"epoch": 2.057344064386318,
"grad_norm": 0.21310564875602722,
"learning_rate": 0.00013666826450094467,
"loss": 0.0236,
"step": 4090
},
{
"epoch": 2.062374245472837,
"grad_norm": 0.256998747587204,
"learning_rate": 0.00013635854520904563,
"loss": 0.023,
"step": 4100
},
{
"epoch": 2.067404426559356,
"grad_norm": 0.23511482775211334,
"learning_rate": 0.00013604842349130295,
"loss": 0.0202,
"step": 4110
},
{
"epoch": 2.0724346076458753,
"grad_norm": 0.3412628769874573,
"learning_rate": 0.00013573790278022452,
"loss": 0.023,
"step": 4120
},
{
"epoch": 2.0774647887323945,
"grad_norm": 0.3615216314792633,
"learning_rate": 0.0001354269865127344,
"loss": 0.0217,
"step": 4130
},
{
"epoch": 2.0824949698189137,
"grad_norm": 0.17151637375354767,
"learning_rate": 0.0001351156781301348,
"loss": 0.019,
"step": 4140
},
{
"epoch": 2.0875251509054324,
"grad_norm": 0.29277974367141724,
"learning_rate": 0.0001348039810780679,
"loss": 0.0294,
"step": 4150
},
{
"epoch": 2.0925553319919517,
"grad_norm": 0.2807733714580536,
"learning_rate": 0.00013449189880647782,
"loss": 0.0195,
"step": 4160
},
{
"epoch": 2.097585513078471,
"grad_norm": 0.31441447138786316,
"learning_rate": 0.00013417943476957248,
"loss": 0.0203,
"step": 4170
},
{
"epoch": 2.10261569416499,
"grad_norm": 0.3053465783596039,
"learning_rate": 0.0001338665924257851,
"loss": 0.0243,
"step": 4180
},
{
"epoch": 2.1076458752515093,
"grad_norm": 0.22241270542144775,
"learning_rate": 0.0001335533752377362,
"loss": 0.0233,
"step": 4190
},
{
"epoch": 2.112676056338028,
"grad_norm": 0.2186601608991623,
"learning_rate": 0.00013323978667219513,
"loss": 0.0215,
"step": 4200
},
{
"epoch": 2.1177062374245472,
"grad_norm": 0.2129117101430893,
"learning_rate": 0.00013292583020004184,
"loss": 0.0236,
"step": 4210
},
{
"epoch": 2.1227364185110664,
"grad_norm": 0.40424126386642456,
"learning_rate": 0.00013261150929622822,
"loss": 0.0286,
"step": 4220
},
{
"epoch": 2.1277665995975856,
"grad_norm": 0.27489861845970154,
"learning_rate": 0.00013229682743973992,
"loss": 0.0241,
"step": 4230
},
{
"epoch": 2.132796780684105,
"grad_norm": 0.20953650772571564,
"learning_rate": 0.0001319817881135576,
"loss": 0.0214,
"step": 4240
},
{
"epoch": 2.1378269617706236,
"grad_norm": 0.3081333041191101,
"learning_rate": 0.0001316663948046186,
"loss": 0.0269,
"step": 4250
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.28377407789230347,
"learning_rate": 0.00013135065100377814,
"loss": 0.0207,
"step": 4260
},
{
"epoch": 2.147887323943662,
"grad_norm": 0.28675928711891174,
"learning_rate": 0.00013103456020577078,
"loss": 0.0251,
"step": 4270
},
{
"epoch": 2.152917505030181,
"grad_norm": 0.21341602504253387,
"learning_rate": 0.00013071812590917175,
"loss": 0.0244,
"step": 4280
},
{
"epoch": 2.1579476861167004,
"grad_norm": 0.27167797088623047,
"learning_rate": 0.0001304013516163583,
"loss": 0.0348,
"step": 4290
},
{
"epoch": 2.162977867203219,
"grad_norm": 0.2633381187915802,
"learning_rate": 0.00013008424083347072,
"loss": 0.0174,
"step": 4300
},
{
"epoch": 2.1680080482897384,
"grad_norm": 0.30004456639289856,
"learning_rate": 0.00012976679707037367,
"loss": 0.0212,
"step": 4310
},
{
"epoch": 2.1730382293762576,
"grad_norm": 0.2568664252758026,
"learning_rate": 0.00012944902384061746,
"loss": 0.0222,
"step": 4320
},
{
"epoch": 2.1780684104627768,
"grad_norm": 0.4086577594280243,
"learning_rate": 0.00012913092466139883,
"loss": 0.0205,
"step": 4330
},
{
"epoch": 2.183098591549296,
"grad_norm": 0.34458601474761963,
"learning_rate": 0.00012881250305352233,
"loss": 0.0166,
"step": 4340
},
{
"epoch": 2.1881287726358147,
"grad_norm": 0.20635934174060822,
"learning_rate": 0.00012849376254136125,
"loss": 0.0232,
"step": 4350
},
{
"epoch": 2.193158953722334,
"grad_norm": 0.39725396037101746,
"learning_rate": 0.00012817470665281853,
"loss": 0.0259,
"step": 4360
},
{
"epoch": 2.198189134808853,
"grad_norm": 0.23941123485565186,
"learning_rate": 0.0001278553389192878,
"loss": 0.0243,
"step": 4370
},
{
"epoch": 2.2032193158953723,
"grad_norm": 0.25489136576652527,
"learning_rate": 0.00012753566287561428,
"loss": 0.0234,
"step": 4380
},
{
"epoch": 2.2082494969818915,
"grad_norm": 0.24230477213859558,
"learning_rate": 0.00012721568206005562,
"loss": 0.0283,
"step": 4390
},
{
"epoch": 2.2132796780684103,
"grad_norm": 0.1920616328716278,
"learning_rate": 0.00012689540001424284,
"loss": 0.0176,
"step": 4400
},
{
"epoch": 2.2183098591549295,
"grad_norm": 0.34041905403137207,
"learning_rate": 0.00012657482028314096,
"loss": 0.019,
"step": 4410
},
{
"epoch": 2.2233400402414487,
"grad_norm": 0.16888068616390228,
"learning_rate": 0.00012625394641500995,
"loss": 0.0212,
"step": 4420
},
{
"epoch": 2.228370221327968,
"grad_norm": 0.30867740511894226,
"learning_rate": 0.00012593278196136525,
"loss": 0.0209,
"step": 4430
},
{
"epoch": 2.233400402414487,
"grad_norm": 0.2323102056980133,
"learning_rate": 0.0001256113304769387,
"loss": 0.0269,
"step": 4440
},
{
"epoch": 2.238430583501006,
"grad_norm": 0.2643173038959503,
"learning_rate": 0.00012528959551963897,
"loss": 0.0204,
"step": 4450
},
{
"epoch": 2.243460764587525,
"grad_norm": 0.26139459013938904,
"learning_rate": 0.00012496758065051234,
"loss": 0.0277,
"step": 4460
},
{
"epoch": 2.2484909456740443,
"grad_norm": 0.5168742537498474,
"learning_rate": 0.0001246452894337032,
"loss": 0.0254,
"step": 4470
},
{
"epoch": 2.2535211267605635,
"grad_norm": 0.4557676911354065,
"learning_rate": 0.0001243227254364147,
"loss": 0.0226,
"step": 4480
},
{
"epoch": 2.2585513078470827,
"grad_norm": 0.3516198396682739,
"learning_rate": 0.00012399989222886906,
"loss": 0.0243,
"step": 4490
},
{
"epoch": 2.2635814889336014,
"grad_norm": 0.20463688671588898,
"learning_rate": 0.00012367679338426833,
"loss": 0.0363,
"step": 4500
},
{
"epoch": 2.2686116700201207,
"grad_norm": 0.384490042924881,
"learning_rate": 0.00012335343247875456,
"loss": 0.0248,
"step": 4510
},
{
"epoch": 2.27364185110664,
"grad_norm": 0.24953685700893402,
"learning_rate": 0.00012302981309137052,
"loss": 0.0327,
"step": 4520
},
{
"epoch": 2.278672032193159,
"grad_norm": 0.25503483414649963,
"learning_rate": 0.00012270593880401974,
"loss": 0.0289,
"step": 4530
},
{
"epoch": 2.2837022132796783,
"grad_norm": 0.2341834008693695,
"learning_rate": 0.00012238181320142722,
"loss": 0.0222,
"step": 4540
},
{
"epoch": 2.288732394366197,
"grad_norm": 0.27594631910324097,
"learning_rate": 0.0001220574398710995,
"loss": 0.0298,
"step": 4550
},
{
"epoch": 2.2937625754527162,
"grad_norm": 0.26579129695892334,
"learning_rate": 0.00012173282240328505,
"loss": 0.0225,
"step": 4560
},
{
"epoch": 2.2987927565392354,
"grad_norm": 0.2124788463115692,
"learning_rate": 0.00012140796439093447,
"loss": 0.0237,
"step": 4570
},
{
"epoch": 2.3038229376257546,
"grad_norm": 0.2432093769311905,
"learning_rate": 0.00012108286942966085,
"loss": 0.0264,
"step": 4580
},
{
"epoch": 2.308853118712274,
"grad_norm": 0.2657240629196167,
"learning_rate": 0.00012075754111769984,
"loss": 0.0193,
"step": 4590
},
{
"epoch": 2.3138832997987926,
"grad_norm": 0.21074149012565613,
"learning_rate": 0.00012043198305586986,
"loss": 0.0176,
"step": 4600
},
{
"epoch": 2.318913480885312,
"grad_norm": 0.29499852657318115,
"learning_rate": 0.00012010619884753238,
"loss": 0.0268,
"step": 4610
},
{
"epoch": 2.323943661971831,
"grad_norm": 0.12746977806091309,
"learning_rate": 0.00011978019209855174,
"loss": 0.0241,
"step": 4620
},
{
"epoch": 2.32897384305835,
"grad_norm": 0.18850328028202057,
"learning_rate": 0.00011945396641725554,
"loss": 0.0165,
"step": 4630
},
{
"epoch": 2.3340040241448694,
"grad_norm": 0.26257720589637756,
"learning_rate": 0.00011912752541439455,
"loss": 0.0239,
"step": 4640
},
{
"epoch": 2.339034205231388,
"grad_norm": 0.399151086807251,
"learning_rate": 0.00011880087270310278,
"loss": 0.0345,
"step": 4650
},
{
"epoch": 2.3440643863179074,
"grad_norm": 0.308682382106781,
"learning_rate": 0.00011847401189885745,
"loss": 0.0268,
"step": 4660
},
{
"epoch": 2.3490945674044266,
"grad_norm": 0.3351981043815613,
"learning_rate": 0.00011814694661943906,
"loss": 0.0262,
"step": 4670
},
{
"epoch": 2.3541247484909458,
"grad_norm": 0.2986966371536255,
"learning_rate": 0.00011781968048489127,
"loss": 0.0199,
"step": 4680
},
{
"epoch": 2.359154929577465,
"grad_norm": 0.18461820483207703,
"learning_rate": 0.00011749221711748086,
"loss": 0.0295,
"step": 4690
},
{
"epoch": 2.3641851106639837,
"grad_norm": 0.33392488956451416,
"learning_rate": 0.0001171645601416576,
"loss": 0.0222,
"step": 4700
},
{
"epoch": 2.369215291750503,
"grad_norm": 0.3629428446292877,
"learning_rate": 0.0001168367131840142,
"loss": 0.0228,
"step": 4710
},
{
"epoch": 2.374245472837022,
"grad_norm": 0.3002311587333679,
"learning_rate": 0.00011650867987324614,
"loss": 0.0209,
"step": 4720
},
{
"epoch": 2.3792756539235413,
"grad_norm": 0.24123568832874298,
"learning_rate": 0.00011618046384011152,
"loss": 0.0189,
"step": 4730
},
{
"epoch": 2.3843058350100605,
"grad_norm": 0.3153541386127472,
"learning_rate": 0.0001158520687173908,
"loss": 0.0306,
"step": 4740
},
{
"epoch": 2.3893360160965793,
"grad_norm": 0.36185353994369507,
"learning_rate": 0.00011552349813984678,
"loss": 0.0181,
"step": 4750
},
{
"epoch": 2.3943661971830985,
"grad_norm": 0.21552976965904236,
"learning_rate": 0.00011519475574418405,
"loss": 0.0199,
"step": 4760
},
{
"epoch": 2.3993963782696177,
"grad_norm": 0.3644528388977051,
"learning_rate": 0.00011486584516900907,
"loss": 0.0251,
"step": 4770
},
{
"epoch": 2.404426559356137,
"grad_norm": 0.3115015923976898,
"learning_rate": 0.00011453677005478968,
"loss": 0.0343,
"step": 4780
},
{
"epoch": 2.409456740442656,
"grad_norm": 0.41604316234588623,
"learning_rate": 0.00011420753404381499,
"loss": 0.03,
"step": 4790
},
{
"epoch": 2.414486921529175,
"grad_norm": 0.3559543192386627,
"learning_rate": 0.00011387814078015482,
"loss": 0.0246,
"step": 4800
},
{
"epoch": 2.419517102615694,
"grad_norm": 0.1890721470117569,
"learning_rate": 0.00011354859390961958,
"loss": 0.0209,
"step": 4810
},
{
"epoch": 2.4245472837022133,
"grad_norm": 0.30657851696014404,
"learning_rate": 0.00011321889707971979,
"loss": 0.02,
"step": 4820
},
{
"epoch": 2.4295774647887325,
"grad_norm": 0.2877327501773834,
"learning_rate": 0.00011288905393962586,
"loss": 0.0237,
"step": 4830
},
{
"epoch": 2.4346076458752517,
"grad_norm": 0.3301834166049957,
"learning_rate": 0.00011255906814012744,
"loss": 0.0219,
"step": 4840
},
{
"epoch": 2.4396378269617705,
"grad_norm": 0.21552981436252594,
"learning_rate": 0.00011222894333359338,
"loss": 0.0221,
"step": 4850
},
{
"epoch": 2.4446680080482897,
"grad_norm": 0.33082151412963867,
"learning_rate": 0.00011189868317393086,
"loss": 0.021,
"step": 4860
},
{
"epoch": 2.449698189134809,
"grad_norm": 0.30314934253692627,
"learning_rate": 0.00011156829131654543,
"loss": 0.0311,
"step": 4870
},
{
"epoch": 2.454728370221328,
"grad_norm": 0.31226858496665955,
"learning_rate": 0.00011123777141830008,
"loss": 0.0272,
"step": 4880
},
{
"epoch": 2.4597585513078473,
"grad_norm": 0.32737886905670166,
"learning_rate": 0.00011090712713747514,
"loss": 0.0248,
"step": 4890
},
{
"epoch": 2.464788732394366,
"grad_norm": 0.2482103705406189,
"learning_rate": 0.00011057636213372755,
"loss": 0.0195,
"step": 4900
},
{
"epoch": 2.4698189134808852,
"grad_norm": 0.4342176616191864,
"learning_rate": 0.00011024548006805051,
"loss": 0.0265,
"step": 4910
},
{
"epoch": 2.4748490945674044,
"grad_norm": 0.3512166440486908,
"learning_rate": 0.00010991448460273287,
"loss": 0.0296,
"step": 4920
},
{
"epoch": 2.4798792756539236,
"grad_norm": 0.4724549949169159,
"learning_rate": 0.00010958337940131857,
"loss": 0.0265,
"step": 4930
},
{
"epoch": 2.484909456740443,
"grad_norm": 0.29837068915367126,
"learning_rate": 0.00010925216812856621,
"loss": 0.0249,
"step": 4940
},
{
"epoch": 2.4899396378269616,
"grad_norm": 0.14961381256580353,
"learning_rate": 0.00010892085445040836,
"loss": 0.0198,
"step": 4950
},
{
"epoch": 2.494969818913481,
"grad_norm": 0.2834800183773041,
"learning_rate": 0.00010858944203391106,
"loss": 0.0262,
"step": 4960
},
{
"epoch": 2.5,
"grad_norm": 0.3048977255821228,
"learning_rate": 0.00010825793454723325,
"loss": 0.0193,
"step": 4970
},
{
"epoch": 2.505030181086519,
"grad_norm": 0.42306941747665405,
"learning_rate": 0.00010792633565958603,
"loss": 0.0164,
"step": 4980
},
{
"epoch": 2.5100603621730384,
"grad_norm": 0.29460790753364563,
"learning_rate": 0.00010759464904119229,
"loss": 0.0219,
"step": 4990
},
{
"epoch": 2.515090543259557,
"grad_norm": 0.2361808717250824,
"learning_rate": 0.00010726287836324582,
"loss": 0.0229,
"step": 5000
},
{
"epoch": 2.5201207243460764,
"grad_norm": 0.2822340130805969,
"learning_rate": 0.00010693102729787088,
"loss": 0.03,
"step": 5010
},
{
"epoch": 2.5251509054325956,
"grad_norm": 0.3465881049633026,
"learning_rate": 0.00010659909951808145,
"loss": 0.0208,
"step": 5020
},
{
"epoch": 2.5301810865191148,
"grad_norm": 0.27967023849487305,
"learning_rate": 0.00010626709869774056,
"loss": 0.0257,
"step": 5030
},
{
"epoch": 2.535211267605634,
"grad_norm": 0.4234648644924164,
"learning_rate": 0.00010593502851151977,
"loss": 0.0245,
"step": 5040
},
{
"epoch": 2.5402414486921527,
"grad_norm": 0.2795238792896271,
"learning_rate": 0.00010560289263485836,
"loss": 0.0201,
"step": 5050
},
{
"epoch": 2.545271629778672,
"grad_norm": 0.27361616492271423,
"learning_rate": 0.00010527069474392266,
"loss": 0.0152,
"step": 5060
},
{
"epoch": 2.550301810865191,
"grad_norm": 0.21000875532627106,
"learning_rate": 0.00010493843851556539,
"loss": 0.0203,
"step": 5070
},
{
"epoch": 2.5553319919517103,
"grad_norm": 0.265424519777298,
"learning_rate": 0.00010460612762728498,
"loss": 0.0227,
"step": 5080
},
{
"epoch": 2.5603621730382295,
"grad_norm": 0.2341059446334839,
"learning_rate": 0.00010427376575718488,
"loss": 0.0257,
"step": 5090
},
{
"epoch": 2.5653923541247483,
"grad_norm": 0.3186612129211426,
"learning_rate": 0.00010394135658393278,
"loss": 0.0249,
"step": 5100
},
{
"epoch": 2.5704225352112675,
"grad_norm": 0.22894106805324554,
"learning_rate": 0.00010360890378671997,
"loss": 0.0222,
"step": 5110
},
{
"epoch": 2.5754527162977867,
"grad_norm": 0.32082125544548035,
"learning_rate": 0.00010327641104522052,
"loss": 0.0258,
"step": 5120
},
{
"epoch": 2.580482897384306,
"grad_norm": 0.21953271329402924,
"learning_rate": 0.00010294388203955067,
"loss": 0.0199,
"step": 5130
},
{
"epoch": 2.585513078470825,
"grad_norm": 0.14508609473705292,
"learning_rate": 0.00010261132045022804,
"loss": 0.0219,
"step": 5140
},
{
"epoch": 2.590543259557344,
"grad_norm": 0.23425619304180145,
"learning_rate": 0.00010227872995813083,
"loss": 0.0238,
"step": 5150
},
{
"epoch": 2.595573440643863,
"grad_norm": 0.18086619675159454,
"learning_rate": 0.00010194611424445721,
"loss": 0.0225,
"step": 5160
},
{
"epoch": 2.6006036217303823,
"grad_norm": 0.2463824599981308,
"learning_rate": 0.0001016134769906845,
"loss": 0.0199,
"step": 5170
},
{
"epoch": 2.6056338028169015,
"grad_norm": 0.23146170377731323,
"learning_rate": 0.00010128082187852846,
"loss": 0.0215,
"step": 5180
},
{
"epoch": 2.6106639839034207,
"grad_norm": 0.24659287929534912,
"learning_rate": 0.00010094815258990241,
"loss": 0.0192,
"step": 5190
},
{
"epoch": 2.6156941649899395,
"grad_norm": 0.18079569935798645,
"learning_rate": 0.00010061547280687664,
"loss": 0.0201,
"step": 5200
},
{
"epoch": 2.6207243460764587,
"grad_norm": 0.21298180520534515,
"learning_rate": 0.00010028278621163762,
"loss": 0.0208,
"step": 5210
},
{
"epoch": 2.625754527162978,
"grad_norm": 0.21694862842559814,
"learning_rate": 9.99500964864472e-05,
"loss": 0.0151,
"step": 5220
},
{
"epoch": 2.630784708249497,
"grad_norm": 0.2606973648071289,
"learning_rate": 9.961740731360184e-05,
"loss": 0.0219,
"step": 5230
},
{
"epoch": 2.6358148893360163,
"grad_norm": 0.34560972452163696,
"learning_rate": 9.928472237539196e-05,
"loss": 0.0188,
"step": 5240
},
{
"epoch": 2.640845070422535,
"grad_norm": 0.26653993129730225,
"learning_rate": 9.895204535406104e-05,
"loss": 0.0161,
"step": 5250
},
{
"epoch": 2.6458752515090542,
"grad_norm": 0.16782517731189728,
"learning_rate": 9.861937993176495e-05,
"loss": 0.0167,
"step": 5260
},
{
"epoch": 2.6509054325955734,
"grad_norm": 0.2072647660970688,
"learning_rate": 9.828672979053119e-05,
"loss": 0.0185,
"step": 5270
},
{
"epoch": 2.6559356136820926,
"grad_norm": 0.34195080399513245,
"learning_rate": 9.795409861221813e-05,
"loss": 0.0156,
"step": 5280
},
{
"epoch": 2.660965794768612,
"grad_norm": 0.3683469891548157,
"learning_rate": 9.762149007847424e-05,
"loss": 0.0166,
"step": 5290
},
{
"epoch": 2.6659959758551306,
"grad_norm": 0.2584269940853119,
"learning_rate": 9.728890787069737e-05,
"loss": 0.0241,
"step": 5300
},
{
"epoch": 2.67102615694165,
"grad_norm": 0.1505228877067566,
"learning_rate": 9.695635566999397e-05,
"loss": 0.0286,
"step": 5310
},
{
"epoch": 2.676056338028169,
"grad_norm": 0.27530187368392944,
"learning_rate": 9.662383715713837e-05,
"loss": 0.0243,
"step": 5320
},
{
"epoch": 2.681086519114688,
"grad_norm": 0.3398008942604065,
"learning_rate": 9.629135601253204e-05,
"loss": 0.0286,
"step": 5330
},
{
"epoch": 2.6861167002012074,
"grad_norm": 0.20086489617824554,
"learning_rate": 9.595891591616282e-05,
"loss": 0.0192,
"step": 5340
},
{
"epoch": 2.691146881287726,
"grad_norm": 0.25084254145622253,
"learning_rate": 9.562652054756429e-05,
"loss": 0.0277,
"step": 5350
},
{
"epoch": 2.6961770623742454,
"grad_norm": 0.25037065148353577,
"learning_rate": 9.529417358577489e-05,
"loss": 0.0214,
"step": 5360
},
{
"epoch": 2.7012072434607646,
"grad_norm": 0.18306289613246918,
"learning_rate": 9.496187870929732e-05,
"loss": 0.0157,
"step": 5370
},
{
"epoch": 2.7062374245472838,
"grad_norm": 0.17593130469322205,
"learning_rate": 9.462963959605778e-05,
"loss": 0.0206,
"step": 5380
},
{
"epoch": 2.711267605633803,
"grad_norm": 0.3547375500202179,
"learning_rate": 9.429745992336522e-05,
"loss": 0.0217,
"step": 5390
},
{
"epoch": 2.7162977867203217,
"grad_norm": 0.26571425795555115,
"learning_rate": 9.396534336787081e-05,
"loss": 0.0198,
"step": 5400
},
{
"epoch": 2.721327967806841,
"grad_norm": 0.3025442957878113,
"learning_rate": 9.363329360552703e-05,
"loss": 0.021,
"step": 5410
},
{
"epoch": 2.72635814889336,
"grad_norm": 0.30082857608795166,
"learning_rate": 9.330131431154708e-05,
"loss": 0.0214,
"step": 5420
},
{
"epoch": 2.7313883299798793,
"grad_norm": 0.18924900889396667,
"learning_rate": 9.29694091603642e-05,
"loss": 0.0141,
"step": 5430
},
{
"epoch": 2.7364185110663986,
"grad_norm": 0.28385961055755615,
"learning_rate": 9.263758182559103e-05,
"loss": 0.0235,
"step": 5440
},
{
"epoch": 2.7414486921529173,
"grad_norm": 0.3015013635158539,
"learning_rate": 9.230583597997888e-05,
"loss": 0.0234,
"step": 5450
},
{
"epoch": 2.7464788732394365,
"grad_norm": 0.3755188584327698,
"learning_rate": 9.197417529537716e-05,
"loss": 0.0206,
"step": 5460
},
{
"epoch": 2.7515090543259557,
"grad_norm": 0.2895854413509369,
"learning_rate": 9.16426034426926e-05,
"loss": 0.0266,
"step": 5470
},
{
"epoch": 2.756539235412475,
"grad_norm": 0.35291990637779236,
"learning_rate": 9.131112409184886e-05,
"loss": 0.0173,
"step": 5480
},
{
"epoch": 2.761569416498994,
"grad_norm": 0.2543307840824127,
"learning_rate": 9.097974091174568e-05,
"loss": 0.0173,
"step": 5490
},
{
"epoch": 2.766599597585513,
"grad_norm": 0.14231809973716736,
"learning_rate": 9.064845757021833e-05,
"loss": 0.0162,
"step": 5500
},
{
"epoch": 2.771629778672032,
"grad_norm": 0.23540771007537842,
"learning_rate": 9.031727773399709e-05,
"loss": 0.0187,
"step": 5510
},
{
"epoch": 2.7766599597585513,
"grad_norm": 0.2745067775249481,
"learning_rate": 8.998620506866663e-05,
"loss": 0.0195,
"step": 5520
},
{
"epoch": 2.7816901408450705,
"grad_norm": 0.22975705564022064,
"learning_rate": 8.965524323862535e-05,
"loss": 0.0201,
"step": 5530
},
{
"epoch": 2.7867203219315897,
"grad_norm": 0.2695607841014862,
"learning_rate": 8.9324395907045e-05,
"loss": 0.0152,
"step": 5540
},
{
"epoch": 2.7917505030181085,
"grad_norm": 0.3059997260570526,
"learning_rate": 8.899366673582994e-05,
"loss": 0.0205,
"step": 5550
},
{
"epoch": 2.7967806841046277,
"grad_norm": 0.2964552938938141,
"learning_rate": 8.866305938557675e-05,
"loss": 0.0216,
"step": 5560
},
{
"epoch": 2.801810865191147,
"grad_norm": 0.23506313562393188,
"learning_rate": 8.833257751553365e-05,
"loss": 0.0223,
"step": 5570
},
{
"epoch": 2.806841046277666,
"grad_norm": 0.2811063230037689,
"learning_rate": 8.800222478356e-05,
"loss": 0.0153,
"step": 5580
},
{
"epoch": 2.8118712273641853,
"grad_norm": 0.1559114307165146,
"learning_rate": 8.767200484608584e-05,
"loss": 0.0235,
"step": 5590
},
{
"epoch": 2.816901408450704,
"grad_norm": 0.3868955075740814,
"learning_rate": 8.734192135807142e-05,
"loss": 0.0238,
"step": 5600
},
{
"epoch": 2.8219315895372232,
"grad_norm": 0.3004298210144043,
"learning_rate": 8.701197797296671e-05,
"loss": 0.0205,
"step": 5610
},
{
"epoch": 2.8269617706237424,
"grad_norm": 0.30548134446144104,
"learning_rate": 8.668217834267096e-05,
"loss": 0.0213,
"step": 5620
},
{
"epoch": 2.8319919517102616,
"grad_norm": 0.4484052360057831,
"learning_rate": 8.635252611749239e-05,
"loss": 0.015,
"step": 5630
},
{
"epoch": 2.837022132796781,
"grad_norm": 0.2639616131782532,
"learning_rate": 8.602302494610764e-05,
"loss": 0.0241,
"step": 5640
},
{
"epoch": 2.8420523138832996,
"grad_norm": 0.290503591299057,
"learning_rate": 8.569367847552143e-05,
"loss": 0.0215,
"step": 5650
},
{
"epoch": 2.847082494969819,
"grad_norm": 0.2626757025718689,
"learning_rate": 8.536449035102624e-05,
"loss": 0.0249,
"step": 5660
},
{
"epoch": 2.852112676056338,
"grad_norm": 0.18214431405067444,
"learning_rate": 8.50354642161619e-05,
"loss": 0.0155,
"step": 5670
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.23727771639823914,
"learning_rate": 8.47066037126754e-05,
"loss": 0.027,
"step": 5680
},
{
"epoch": 2.8621730382293764,
"grad_norm": 0.2826462984085083,
"learning_rate": 8.437791248048037e-05,
"loss": 0.0189,
"step": 5690
},
{
"epoch": 2.867203219315895,
"grad_norm": 0.2718562185764313,
"learning_rate": 8.404939415761693e-05,
"loss": 0.0235,
"step": 5700
},
{
"epoch": 2.8722334004024144,
"grad_norm": 0.27361899614334106,
"learning_rate": 8.372105238021138e-05,
"loss": 0.0235,
"step": 5710
},
{
"epoch": 2.8772635814889336,
"grad_norm": 0.18473175168037415,
"learning_rate": 8.339289078243602e-05,
"loss": 0.0158,
"step": 5720
},
{
"epoch": 2.8822937625754528,
"grad_norm": 0.24013389647006989,
"learning_rate": 8.306491299646884e-05,
"loss": 0.0215,
"step": 5730
},
{
"epoch": 2.887323943661972,
"grad_norm": 0.22806933522224426,
"learning_rate": 8.273712265245336e-05,
"loss": 0.0148,
"step": 5740
},
{
"epoch": 2.8923541247484907,
"grad_norm": 0.23373086750507355,
"learning_rate": 8.240952337845844e-05,
"loss": 0.0224,
"step": 5750
},
{
"epoch": 2.89738430583501,
"grad_norm": 0.19605842232704163,
"learning_rate": 8.208211880043812e-05,
"loss": 0.0146,
"step": 5760
},
{
"epoch": 2.902414486921529,
"grad_norm": 0.2649703621864319,
"learning_rate": 8.175491254219151e-05,
"loss": 0.0168,
"step": 5770
},
{
"epoch": 2.9074446680080483,
"grad_norm": 0.1927386075258255,
"learning_rate": 8.142790822532266e-05,
"loss": 0.017,
"step": 5780
},
{
"epoch": 2.9124748490945676,
"grad_norm": 0.18656544387340546,
"learning_rate": 8.110110946920049e-05,
"loss": 0.0198,
"step": 5790
},
{
"epoch": 2.9175050301810863,
"grad_norm": 0.1952710896730423,
"learning_rate": 8.077451989091868e-05,
"loss": 0.0172,
"step": 5800
},
{
"epoch": 2.9225352112676055,
"grad_norm": 0.31542059779167175,
"learning_rate": 8.044814310525574e-05,
"loss": 0.024,
"step": 5810
},
{
"epoch": 2.9275653923541247,
"grad_norm": 0.22629360854625702,
"learning_rate": 8.012198272463486e-05,
"loss": 0.0234,
"step": 5820
},
{
"epoch": 2.932595573440644,
"grad_norm": 0.2527533769607544,
"learning_rate": 7.979604235908406e-05,
"loss": 0.0192,
"step": 5830
},
{
"epoch": 2.937625754527163,
"grad_norm": 0.27879974246025085,
"learning_rate": 7.947032561619617e-05,
"loss": 0.0162,
"step": 5840
},
{
"epoch": 2.942655935613682,
"grad_norm": 0.33129119873046875,
"learning_rate": 7.914483610108888e-05,
"loss": 0.0196,
"step": 5850
},
{
"epoch": 2.947686116700201,
"grad_norm": 0.29584264755249023,
"learning_rate": 7.881957741636486e-05,
"loss": 0.0226,
"step": 5860
},
{
"epoch": 2.9527162977867203,
"grad_norm": 0.3168732821941376,
"learning_rate": 7.849455316207197e-05,
"loss": 0.0178,
"step": 5870
},
{
"epoch": 2.9577464788732395,
"grad_norm": 0.1575564593076706,
"learning_rate": 7.816976693566324e-05,
"loss": 0.0221,
"step": 5880
},
{
"epoch": 2.9627766599597587,
"grad_norm": 0.2546273469924927,
"learning_rate": 7.784522233195716e-05,
"loss": 0.0167,
"step": 5890
},
{
"epoch": 2.9678068410462775,
"grad_norm": 0.19285175204277039,
"learning_rate": 7.75209229430979e-05,
"loss": 0.0196,
"step": 5900
},
{
"epoch": 2.9728370221327967,
"grad_norm": 0.25114384293556213,
"learning_rate": 7.719687235851554e-05,
"loss": 0.0224,
"step": 5910
},
{
"epoch": 2.977867203219316,
"grad_norm": 0.2990531623363495,
"learning_rate": 7.687307416488625e-05,
"loss": 0.0185,
"step": 5920
},
{
"epoch": 2.982897384305835,
"grad_norm": 0.19075430929660797,
"learning_rate": 7.654953194609281e-05,
"loss": 0.0238,
"step": 5930
},
{
"epoch": 2.9879275653923543,
"grad_norm": 0.28794053196907043,
"learning_rate": 7.62262492831847e-05,
"loss": 0.0198,
"step": 5940
},
{
"epoch": 2.992957746478873,
"grad_norm": 0.26047641038894653,
"learning_rate": 7.590322975433857e-05,
"loss": 0.014,
"step": 5950
},
{
"epoch": 2.9979879275653922,
"grad_norm": 0.2440265715122223,
"learning_rate": 7.558047693481868e-05,
"loss": 0.017,
"step": 5960
},
{
"epoch": 3.0030181086519114,
"grad_norm": 0.21606865525245667,
"learning_rate": 7.525799439693723e-05,
"loss": 0.0173,
"step": 5970
},
{
"epoch": 3.0080482897384306,
"grad_norm": 0.24635550379753113,
"learning_rate": 7.493578571001497e-05,
"loss": 0.0117,
"step": 5980
},
{
"epoch": 3.01307847082495,
"grad_norm": 0.17602556943893433,
"learning_rate": 7.461385444034145e-05,
"loss": 0.0149,
"step": 5990
},
{
"epoch": 3.0181086519114686,
"grad_norm": 0.2443585842847824,
"learning_rate": 7.429220415113582e-05,
"loss": 0.02,
"step": 6000
},
{
"epoch": 3.023138832997988,
"grad_norm": 0.25802722573280334,
"learning_rate": 7.39708384025072e-05,
"loss": 0.0185,
"step": 6010
},
{
"epoch": 3.028169014084507,
"grad_norm": 0.1731959879398346,
"learning_rate": 7.364976075141534e-05,
"loss": 0.0159,
"step": 6020
},
{
"epoch": 3.033199195171026,
"grad_norm": 0.2295977920293808,
"learning_rate": 7.33289747516313e-05,
"loss": 0.0223,
"step": 6030
},
{
"epoch": 3.0382293762575454,
"grad_norm": 0.27841848134994507,
"learning_rate": 7.300848395369801e-05,
"loss": 0.0246,
"step": 6040
},
{
"epoch": 3.043259557344064,
"grad_norm": 0.2819521725177765,
"learning_rate": 7.268829190489111e-05,
"loss": 0.0175,
"step": 6050
},
{
"epoch": 3.0482897384305834,
"grad_norm": 0.32469579577445984,
"learning_rate": 7.236840214917949e-05,
"loss": 0.0179,
"step": 6060
},
{
"epoch": 3.0533199195171026,
"grad_norm": 0.28703153133392334,
"learning_rate": 7.204881822718625e-05,
"loss": 0.0191,
"step": 6070
},
{
"epoch": 3.058350100603622,
"grad_norm": 0.27047568559646606,
"learning_rate": 7.172954367614945e-05,
"loss": 0.0162,
"step": 6080
},
{
"epoch": 3.063380281690141,
"grad_norm": 0.2722102403640747,
"learning_rate": 7.141058202988293e-05,
"loss": 0.0189,
"step": 6090
},
{
"epoch": 3.0684104627766597,
"grad_norm": 0.2391163408756256,
"learning_rate": 7.109193681873726e-05,
"loss": 0.0175,
"step": 6100
},
{
"epoch": 3.073440643863179,
"grad_norm": 0.4041613042354584,
"learning_rate": 7.077361156956055e-05,
"loss": 0.021,
"step": 6110
},
{
"epoch": 3.078470824949698,
"grad_norm": 0.2219713032245636,
"learning_rate": 7.045560980565957e-05,
"loss": 0.0169,
"step": 6120
},
{
"epoch": 3.0835010060362174,
"grad_norm": 0.21718035638332367,
"learning_rate": 7.013793504676061e-05,
"loss": 0.0194,
"step": 6130
},
{
"epoch": 3.0885311871227366,
"grad_norm": 0.19979757070541382,
"learning_rate": 6.982059080897059e-05,
"loss": 0.0165,
"step": 6140
},
{
"epoch": 3.0935613682092553,
"grad_norm": 0.2233651578426361,
"learning_rate": 6.950358060473814e-05,
"loss": 0.0151,
"step": 6150
},
{
"epoch": 3.0985915492957745,
"grad_norm": 0.24543243646621704,
"learning_rate": 6.918690794281475e-05,
"loss": 0.019,
"step": 6160
},
{
"epoch": 3.1036217303822937,
"grad_norm": 0.3978748917579651,
"learning_rate": 6.887057632821587e-05,
"loss": 0.0216,
"step": 6170
},
{
"epoch": 3.108651911468813,
"grad_norm": 0.22909055650234222,
"learning_rate": 6.855458926218219e-05,
"loss": 0.0206,
"step": 6180
},
{
"epoch": 3.113682092555332,
"grad_norm": 0.32850995659828186,
"learning_rate": 6.823895024214077e-05,
"loss": 0.0145,
"step": 6190
},
{
"epoch": 3.118712273641851,
"grad_norm": 0.22510278224945068,
"learning_rate": 6.79236627616665e-05,
"loss": 0.0215,
"step": 6200
},
{
"epoch": 3.12374245472837,
"grad_norm": 0.2886231541633606,
"learning_rate": 6.760873031044328e-05,
"loss": 0.0152,
"step": 6210
},
{
"epoch": 3.1287726358148893,
"grad_norm": 0.17671062052249908,
"learning_rate": 6.729415637422543e-05,
"loss": 0.0242,
"step": 6220
},
{
"epoch": 3.1338028169014085,
"grad_norm": 0.3519156873226166,
"learning_rate": 6.69799444347992e-05,
"loss": 0.0158,
"step": 6230
},
{
"epoch": 3.1388329979879277,
"grad_norm": 0.2459878772497177,
"learning_rate": 6.666609796994416e-05,
"loss": 0.0175,
"step": 6240
},
{
"epoch": 3.1438631790744465,
"grad_norm": 0.20523589849472046,
"learning_rate": 6.635262045339465e-05,
"loss": 0.0174,
"step": 6250
},
{
"epoch": 3.1488933601609657,
"grad_norm": 0.23958848416805267,
"learning_rate": 6.603951535480146e-05,
"loss": 0.0215,
"step": 6260
},
{
"epoch": 3.153923541247485,
"grad_norm": 0.21609632670879364,
"learning_rate": 6.572678613969331e-05,
"loss": 0.0205,
"step": 6270
},
{
"epoch": 3.158953722334004,
"grad_norm": 0.20294173061847687,
"learning_rate": 6.541443626943855e-05,
"loss": 0.0177,
"step": 6280
},
{
"epoch": 3.1639839034205233,
"grad_norm": 0.21324007213115692,
"learning_rate": 6.51024692012069e-05,
"loss": 0.0187,
"step": 6290
},
{
"epoch": 3.169014084507042,
"grad_norm": 0.42404302954673767,
"learning_rate": 6.479088838793106e-05,
"loss": 0.0205,
"step": 6300
},
{
"epoch": 3.1740442655935612,
"grad_norm": 0.2945919334888458,
"learning_rate": 6.447969727826859e-05,
"loss": 0.0148,
"step": 6310
},
{
"epoch": 3.1790744466800804,
"grad_norm": 0.3685127794742584,
"learning_rate": 6.41688993165637e-05,
"loss": 0.0194,
"step": 6320
},
{
"epoch": 3.1841046277665996,
"grad_norm": 0.295194536447525,
"learning_rate": 6.385849794280915e-05,
"loss": 0.0202,
"step": 6330
},
{
"epoch": 3.189134808853119,
"grad_norm": 0.2186952531337738,
"learning_rate": 6.354849659260815e-05,
"loss": 0.0172,
"step": 6340
},
{
"epoch": 3.1941649899396376,
"grad_norm": 0.22984078526496887,
"learning_rate": 6.323889869713637e-05,
"loss": 0.015,
"step": 6350
},
{
"epoch": 3.199195171026157,
"grad_norm": 0.2033555507659912,
"learning_rate": 6.292970768310387e-05,
"loss": 0.0145,
"step": 6360
},
{
"epoch": 3.204225352112676,
"grad_norm": 0.3300851583480835,
"learning_rate": 6.262092697271732e-05,
"loss": 0.0202,
"step": 6370
},
{
"epoch": 3.209255533199195,
"grad_norm": 0.2983047366142273,
"learning_rate": 6.2312559983642e-05,
"loss": 0.0174,
"step": 6380
},
{
"epoch": 3.2142857142857144,
"grad_norm": 0.1934136003255844,
"learning_rate": 6.200461012896402e-05,
"loss": 0.0233,
"step": 6390
},
{
"epoch": 3.219315895372233,
"grad_norm": 0.23245446383953094,
"learning_rate": 6.169708081715253e-05,
"loss": 0.018,
"step": 6400
},
{
"epoch": 3.2243460764587524,
"grad_norm": 0.24971647560596466,
"learning_rate": 6.1389975452022e-05,
"loss": 0.0167,
"step": 6410
},
{
"epoch": 3.2293762575452716,
"grad_norm": 0.23975920677185059,
"learning_rate": 6.108329743269461e-05,
"loss": 0.014,
"step": 6420
},
{
"epoch": 3.234406438631791,
"grad_norm": 0.14767713844776154,
"learning_rate": 6.07770501535625e-05,
"loss": 0.019,
"step": 6430
},
{
"epoch": 3.23943661971831,
"grad_norm": 0.22770993411540985,
"learning_rate": 6.047123700425026e-05,
"loss": 0.0128,
"step": 6440
},
{
"epoch": 3.2444668008048287,
"grad_norm": 0.2288396656513214,
"learning_rate": 6.016586136957745e-05,
"loss": 0.0173,
"step": 6450
},
{
"epoch": 3.249496981891348,
"grad_norm": 0.1313876062631607,
"learning_rate": 5.98609266295211e-05,
"loss": 0.0131,
"step": 6460
},
{
"epoch": 3.254527162977867,
"grad_norm": 0.18414969742298126,
"learning_rate": 5.955643615917825e-05,
"loss": 0.016,
"step": 6470
},
{
"epoch": 3.2595573440643864,
"grad_norm": 0.19654971361160278,
"learning_rate": 5.925239332872878e-05,
"loss": 0.0128,
"step": 6480
},
{
"epoch": 3.2645875251509056,
"grad_norm": 0.1665862649679184,
"learning_rate": 5.894880150339783e-05,
"loss": 0.0148,
"step": 6490
},
{
"epoch": 3.2696177062374243,
"grad_norm": 0.1726180911064148,
"learning_rate": 5.864566404341878e-05,
"loss": 0.0147,
"step": 6500
},
{
"epoch": 3.2746478873239435,
"grad_norm": 0.2508947253227234,
"learning_rate": 5.834298430399594e-05,
"loss": 0.0212,
"step": 6510
},
{
"epoch": 3.2796780684104627,
"grad_norm": 0.18195697665214539,
"learning_rate": 5.804076563526744e-05,
"loss": 0.0163,
"step": 6520
},
{
"epoch": 3.284708249496982,
"grad_norm": 0.38305532932281494,
"learning_rate": 5.773901138226826e-05,
"loss": 0.0133,
"step": 6530
},
{
"epoch": 3.289738430583501,
"grad_norm": 0.19784590601921082,
"learning_rate": 5.743772488489294e-05,
"loss": 0.0191,
"step": 6540
},
{
"epoch": 3.29476861167002,
"grad_norm": 0.11469227820634842,
"learning_rate": 5.71369094778589e-05,
"loss": 0.0188,
"step": 6550
},
{
"epoch": 3.299798792756539,
"grad_norm": 0.2073812335729599,
"learning_rate": 5.6836568490669384e-05,
"loss": 0.0117,
"step": 6560
},
{
"epoch": 3.3048289738430583,
"grad_norm": 0.12024883180856705,
"learning_rate": 5.653670524757667e-05,
"loss": 0.0167,
"step": 6570
},
{
"epoch": 3.3098591549295775,
"grad_norm": 0.1882588267326355,
"learning_rate": 5.623732306754511e-05,
"loss": 0.0123,
"step": 6580
},
{
"epoch": 3.3148893360160967,
"grad_norm": 0.13178786635398865,
"learning_rate": 5.5938425264214657e-05,
"loss": 0.0156,
"step": 6590
},
{
"epoch": 3.3199195171026155,
"grad_norm": 0.1943099945783615,
"learning_rate": 5.564001514586403e-05,
"loss": 0.0117,
"step": 6600
},
{
"epoch": 3.3249496981891347,
"grad_norm": 0.2532840073108673,
"learning_rate": 5.534209601537407e-05,
"loss": 0.0157,
"step": 6610
},
{
"epoch": 3.329979879275654,
"grad_norm": 0.216691792011261,
"learning_rate": 5.50446711701913e-05,
"loss": 0.0195,
"step": 6620
},
{
"epoch": 3.335010060362173,
"grad_norm": 0.1876860409975052,
"learning_rate": 5.474774390229129e-05,
"loss": 0.0152,
"step": 6630
},
{
"epoch": 3.3400402414486923,
"grad_norm": 0.11754748225212097,
"learning_rate": 5.4451317498142365e-05,
"loss": 0.0159,
"step": 6640
},
{
"epoch": 3.345070422535211,
"grad_norm": 0.5310174822807312,
"learning_rate": 5.4155395238669185e-05,
"loss": 0.0155,
"step": 6650
},
{
"epoch": 3.3501006036217302,
"grad_norm": 0.3709560036659241,
"learning_rate": 5.385998039921627e-05,
"loss": 0.0194,
"step": 6660
},
{
"epoch": 3.3551307847082494,
"grad_norm": 0.25374695658683777,
"learning_rate": 5.3565076249512034e-05,
"loss": 0.0184,
"step": 6670
},
{
"epoch": 3.3601609657947686,
"grad_norm": 0.2403491884469986,
"learning_rate": 5.3270686053632323e-05,
"loss": 0.0153,
"step": 6680
},
{
"epoch": 3.365191146881288,
"grad_norm": 0.23633185029029846,
"learning_rate": 5.29768130699645e-05,
"loss": 0.0161,
"step": 6690
},
{
"epoch": 3.3702213279678066,
"grad_norm": 0.3900575637817383,
"learning_rate": 5.268346055117129e-05,
"loss": 0.0258,
"step": 6700
},
{
"epoch": 3.375251509054326,
"grad_norm": 0.141703262925148,
"learning_rate": 5.239063174415466e-05,
"loss": 0.0121,
"step": 6710
},
{
"epoch": 3.380281690140845,
"grad_norm": 0.17671038210391998,
"learning_rate": 5.209832989002015e-05,
"loss": 0.0149,
"step": 6720
},
{
"epoch": 3.385311871227364,
"grad_norm": 0.24591059982776642,
"learning_rate": 5.18065582240407e-05,
"loss": 0.0234,
"step": 6730
},
{
"epoch": 3.3903420523138834,
"grad_norm": 0.26220864057540894,
"learning_rate": 5.151531997562116e-05,
"loss": 0.0255,
"step": 6740
},
{
"epoch": 3.395372233400402,
"grad_norm": 0.29695531725883484,
"learning_rate": 5.122461836826218e-05,
"loss": 0.019,
"step": 6750
},
{
"epoch": 3.4004024144869214,
"grad_norm": 0.12897849082946777,
"learning_rate": 5.0934456619524896e-05,
"loss": 0.0191,
"step": 6760
},
{
"epoch": 3.4054325955734406,
"grad_norm": 0.16902518272399902,
"learning_rate": 5.064483794099508e-05,
"loss": 0.0207,
"step": 6770
},
{
"epoch": 3.41046277665996,
"grad_norm": 0.20852209627628326,
"learning_rate": 5.0355765538247636e-05,
"loss": 0.0146,
"step": 6780
},
{
"epoch": 3.415492957746479,
"grad_norm": 0.16347025334835052,
"learning_rate": 5.006724261081118e-05,
"loss": 0.0186,
"step": 6790
},
{
"epoch": 3.4205231388329977,
"grad_norm": 0.18849849700927734,
"learning_rate": 4.977927235213259e-05,
"loss": 0.0217,
"step": 6800
},
{
"epoch": 3.425553319919517,
"grad_norm": 0.30719512701034546,
"learning_rate": 4.9491857949541696e-05,
"loss": 0.0176,
"step": 6810
},
{
"epoch": 3.430583501006036,
"grad_norm": 0.21589145064353943,
"learning_rate": 4.9205002584215855e-05,
"loss": 0.015,
"step": 6820
},
{
"epoch": 3.4356136820925554,
"grad_norm": 0.23759104311466217,
"learning_rate": 4.891870943114496e-05,
"loss": 0.0198,
"step": 6830
},
{
"epoch": 3.4406438631790746,
"grad_norm": 0.26880696415901184,
"learning_rate": 4.86329816590962e-05,
"loss": 0.0142,
"step": 6840
},
{
"epoch": 3.4456740442655933,
"grad_norm": 0.192754328250885,
"learning_rate": 4.8347822430578856e-05,
"loss": 0.0166,
"step": 6850
},
{
"epoch": 3.4507042253521125,
"grad_norm": 0.12232445925474167,
"learning_rate": 4.80632349018096e-05,
"loss": 0.0175,
"step": 6860
},
{
"epoch": 3.4557344064386317,
"grad_norm": 0.1528378129005432,
"learning_rate": 4.777922222267721e-05,
"loss": 0.0134,
"step": 6870
},
{
"epoch": 3.460764587525151,
"grad_norm": 0.37869587540626526,
"learning_rate": 4.749578753670799e-05,
"loss": 0.0209,
"step": 6880
},
{
"epoch": 3.46579476861167,
"grad_norm": 0.24779389798641205,
"learning_rate": 4.721293398103086e-05,
"loss": 0.0167,
"step": 6890
},
{
"epoch": 3.470824949698189,
"grad_norm": 0.1252845823764801,
"learning_rate": 4.6930664686342526e-05,
"loss": 0.0118,
"step": 6900
},
{
"epoch": 3.475855130784708,
"grad_norm": 0.28004470467567444,
"learning_rate": 4.664898277687313e-05,
"loss": 0.0229,
"step": 6910
},
{
"epoch": 3.4808853118712273,
"grad_norm": 0.1652926206588745,
"learning_rate": 4.636789137035129e-05,
"loss": 0.0201,
"step": 6920
},
{
"epoch": 3.4859154929577465,
"grad_norm": 0.12315156310796738,
"learning_rate": 4.6087393577969926e-05,
"loss": 0.0133,
"step": 6930
},
{
"epoch": 3.4909456740442657,
"grad_norm": 0.2795407772064209,
"learning_rate": 4.580749250435158e-05,
"loss": 0.0148,
"step": 6940
},
{
"epoch": 3.4959758551307845,
"grad_norm": 0.25727009773254395,
"learning_rate": 4.5528191247514226e-05,
"loss": 0.0161,
"step": 6950
},
{
"epoch": 3.5010060362173037,
"grad_norm": 0.2040313482284546,
"learning_rate": 4.524949289883692e-05,
"loss": 0.0169,
"step": 6960
},
{
"epoch": 3.506036217303823,
"grad_norm": 0.21510617434978485,
"learning_rate": 4.497140054302548e-05,
"loss": 0.019,
"step": 6970
},
{
"epoch": 3.511066398390342,
"grad_norm": 0.22986271977424622,
"learning_rate": 4.469391725807854e-05,
"loss": 0.0177,
"step": 6980
},
{
"epoch": 3.5160965794768613,
"grad_norm": 0.1664620190858841,
"learning_rate": 4.4417046115253304e-05,
"loss": 0.0207,
"step": 6990
},
{
"epoch": 3.52112676056338,
"grad_norm": 0.24035824835300446,
"learning_rate": 4.414079017903166e-05,
"loss": 0.021,
"step": 7000
},
{
"epoch": 3.5261569416498992,
"grad_norm": 0.2778620719909668,
"learning_rate": 4.386515250708627e-05,
"loss": 0.0179,
"step": 7010
},
{
"epoch": 3.5311871227364184,
"grad_norm": 0.2031363546848297,
"learning_rate": 4.3590136150246555e-05,
"loss": 0.0151,
"step": 7020
},
{
"epoch": 3.5362173038229376,
"grad_norm": 0.29513317346572876,
"learning_rate": 4.3315744152465276e-05,
"loss": 0.0166,
"step": 7030
},
{
"epoch": 3.541247484909457,
"grad_norm": 0.2097848355770111,
"learning_rate": 4.3041979550784395e-05,
"loss": 0.0142,
"step": 7040
},
{
"epoch": 3.5462776659959756,
"grad_norm": 0.21217374503612518,
"learning_rate": 4.276884537530187e-05,
"loss": 0.0131,
"step": 7050
},
{
"epoch": 3.551307847082495,
"grad_norm": 0.18939641118049622,
"learning_rate": 4.2496344649137776e-05,
"loss": 0.0166,
"step": 7060
},
{
"epoch": 3.556338028169014,
"grad_norm": 0.17117512226104736,
"learning_rate": 4.222448038840113e-05,
"loss": 0.0187,
"step": 7070
},
{
"epoch": 3.561368209255533,
"grad_norm": 0.15176957845687866,
"learning_rate": 4.1953255602156394e-05,
"loss": 0.0193,
"step": 7080
},
{
"epoch": 3.5663983903420524,
"grad_norm": 0.3019493818283081,
"learning_rate": 4.168267329239002e-05,
"loss": 0.0171,
"step": 7090
},
{
"epoch": 3.571428571428571,
"grad_norm": 0.18399718403816223,
"learning_rate": 4.141273645397754e-05,
"loss": 0.018,
"step": 7100
},
{
"epoch": 3.5764587525150904,
"grad_norm": 0.16979122161865234,
"learning_rate": 4.114344807465007e-05,
"loss": 0.0209,
"step": 7110
},
{
"epoch": 3.5814889336016096,
"grad_norm": 0.2553984820842743,
"learning_rate": 4.087481113496159e-05,
"loss": 0.0123,
"step": 7120
},
{
"epoch": 3.586519114688129,
"grad_norm": 0.20643579959869385,
"learning_rate": 4.060682860825559e-05,
"loss": 0.0159,
"step": 7130
},
{
"epoch": 3.591549295774648,
"grad_norm": 0.20078648626804352,
"learning_rate": 4.033950346063248e-05,
"loss": 0.0142,
"step": 7140
},
{
"epoch": 3.5965794768611667,
"grad_norm": 0.13958579301834106,
"learning_rate": 4.007283865091662e-05,
"loss": 0.0153,
"step": 7150
},
{
"epoch": 3.6016096579476864,
"grad_norm": 0.21849416196346283,
"learning_rate": 3.98068371306235e-05,
"loss": 0.022,
"step": 7160
},
{
"epoch": 3.606639839034205,
"grad_norm": 0.2273043692111969,
"learning_rate": 3.954150184392723e-05,
"loss": 0.0141,
"step": 7170
},
{
"epoch": 3.6116700201207244,
"grad_norm": 0.18715119361877441,
"learning_rate": 3.927683572762778e-05,
"loss": 0.0149,
"step": 7180
},
{
"epoch": 3.6167002012072436,
"grad_norm": 0.27733615040779114,
"learning_rate": 3.9012841711118677e-05,
"loss": 0.0177,
"step": 7190
},
{
"epoch": 3.6217303822937623,
"grad_norm": 0.22101663053035736,
"learning_rate": 3.874952271635444e-05,
"loss": 0.0177,
"step": 7200
},
{
"epoch": 3.626760563380282,
"grad_norm": 0.21932683885097504,
"learning_rate": 3.848688165781819e-05,
"loss": 0.0126,
"step": 7210
},
{
"epoch": 3.6317907444668007,
"grad_norm": 0.18129754066467285,
"learning_rate": 3.82249214424896e-05,
"loss": 0.0133,
"step": 7220
},
{
"epoch": 3.63682092555332,
"grad_norm": 0.2246370166540146,
"learning_rate": 3.796364496981247e-05,
"loss": 0.0148,
"step": 7230
},
{
"epoch": 3.641851106639839,
"grad_norm": 0.22842873632907867,
"learning_rate": 3.7703055131662854e-05,
"loss": 0.0129,
"step": 7240
},
{
"epoch": 3.646881287726358,
"grad_norm": 0.2930648922920227,
"learning_rate": 3.744315481231694e-05,
"loss": 0.0167,
"step": 7250
},
{
"epoch": 3.6519114688128775,
"grad_norm": 0.17970743775367737,
"learning_rate": 3.7183946888419066e-05,
"loss": 0.0135,
"step": 7260
},
{
"epoch": 3.6569416498993963,
"grad_norm": 0.342587411403656,
"learning_rate": 3.692543422895004e-05,
"loss": 0.0183,
"step": 7270
},
{
"epoch": 3.6619718309859155,
"grad_norm": 0.17181113362312317,
"learning_rate": 3.6667619695195285e-05,
"loss": 0.0112,
"step": 7280
},
{
"epoch": 3.6670020120724347,
"grad_norm": 0.14279602468013763,
"learning_rate": 3.6410506140713216e-05,
"loss": 0.0139,
"step": 7290
},
{
"epoch": 3.6720321931589535,
"grad_norm": 0.15404970943927765,
"learning_rate": 3.615409641130351e-05,
"loss": 0.0197,
"step": 7300
},
{
"epoch": 3.677062374245473,
"grad_norm": 0.21196570992469788,
"learning_rate": 3.589839334497587e-05,
"loss": 0.0114,
"step": 7310
},
{
"epoch": 3.682092555331992,
"grad_norm": 0.14234349131584167,
"learning_rate": 3.564339977191834e-05,
"loss": 0.0122,
"step": 7320
},
{
"epoch": 3.687122736418511,
"grad_norm": 0.3660185933113098,
"learning_rate": 3.538911851446619e-05,
"loss": 0.0153,
"step": 7330
},
{
"epoch": 3.6921529175050303,
"grad_norm": 0.20586389303207397,
"learning_rate": 3.5135552387070636e-05,
"loss": 0.0167,
"step": 7340
},
{
"epoch": 3.697183098591549,
"grad_norm": 0.19840209186077118,
"learning_rate": 3.48827041962675e-05,
"loss": 0.0131,
"step": 7350
},
{
"epoch": 3.7022132796780687,
"grad_norm": 0.3007209897041321,
"learning_rate": 3.463057674064646e-05,
"loss": 0.0223,
"step": 7360
},
{
"epoch": 3.7072434607645874,
"grad_norm": 0.20882894098758698,
"learning_rate": 3.437917281081975e-05,
"loss": 0.0204,
"step": 7370
},
{
"epoch": 3.7122736418511066,
"grad_norm": 0.19728295505046844,
"learning_rate": 3.412849518939155e-05,
"loss": 0.018,
"step": 7380
},
{
"epoch": 3.717303822937626,
"grad_norm": 0.2532462179660797,
"learning_rate": 3.387854665092709e-05,
"loss": 0.0161,
"step": 7390
},
{
"epoch": 3.7223340040241446,
"grad_norm": 0.1997775435447693,
"learning_rate": 3.3629329961921765e-05,
"loss": 0.0112,
"step": 7400
},
{
"epoch": 3.7273641851106643,
"grad_norm": 0.17711062729358673,
"learning_rate": 3.338084788077085e-05,
"loss": 0.0156,
"step": 7410
},
{
"epoch": 3.732394366197183,
"grad_norm": 0.1401996910572052,
"learning_rate": 3.313310315773864e-05,
"loss": 0.0281,
"step": 7420
},
{
"epoch": 3.737424547283702,
"grad_norm": 0.1834188550710678,
"learning_rate": 3.288609853492827e-05,
"loss": 0.0162,
"step": 7430
},
{
"epoch": 3.7424547283702214,
"grad_norm": 0.19655165076255798,
"learning_rate": 3.2639836746251216e-05,
"loss": 0.0124,
"step": 7440
},
{
"epoch": 3.74748490945674,
"grad_norm": 0.17778749763965607,
"learning_rate": 3.2394320517397015e-05,
"loss": 0.0153,
"step": 7450
},
{
"epoch": 3.75251509054326,
"grad_norm": 0.16552262008190155,
"learning_rate": 3.214955256580327e-05,
"loss": 0.017,
"step": 7460
},
{
"epoch": 3.7575452716297786,
"grad_norm": 0.19796250760555267,
"learning_rate": 3.1905535600625314e-05,
"loss": 0.0182,
"step": 7470
},
{
"epoch": 3.762575452716298,
"grad_norm": 0.19010251760482788,
"learning_rate": 3.166227232270651e-05,
"loss": 0.0182,
"step": 7480
},
{
"epoch": 3.767605633802817,
"grad_norm": 0.21966631710529327,
"learning_rate": 3.141976542454806e-05,
"loss": 0.018,
"step": 7490
},
{
"epoch": 3.7726358148893357,
"grad_norm": 0.14321380853652954,
"learning_rate": 3.117801759027959e-05,
"loss": 0.0139,
"step": 7500
},
{
"epoch": 3.7776659959758554,
"grad_norm": 0.22963641583919525,
"learning_rate": 3.093703149562892e-05,
"loss": 0.0112,
"step": 7510
},
{
"epoch": 3.782696177062374,
"grad_norm": 0.15116992592811584,
"learning_rate": 3.069680980789294e-05,
"loss": 0.0193,
"step": 7520
},
{
"epoch": 3.7877263581488934,
"grad_norm": 0.14807961881160736,
"learning_rate": 3.0457355185907877e-05,
"loss": 0.0222,
"step": 7530
},
{
"epoch": 3.7927565392354126,
"grad_norm": 0.19666004180908203,
"learning_rate": 3.0218670280019745e-05,
"loss": 0.0138,
"step": 7540
},
{
"epoch": 3.7977867203219313,
"grad_norm": 0.22170531749725342,
"learning_rate": 2.9980757732055277e-05,
"loss": 0.0185,
"step": 7550
},
{
"epoch": 3.802816901408451,
"grad_norm": 0.1371658444404602,
"learning_rate": 2.974362017529242e-05,
"loss": 0.0135,
"step": 7560
},
{
"epoch": 3.8078470824949697,
"grad_norm": 0.1677546203136444,
"learning_rate": 2.9507260234431444e-05,
"loss": 0.0096,
"step": 7570
},
{
"epoch": 3.812877263581489,
"grad_norm": 0.2982484996318817,
"learning_rate": 2.9271680525565724e-05,
"loss": 0.0155,
"step": 7580
},
{
"epoch": 3.817907444668008,
"grad_norm": 0.23418201506137848,
"learning_rate": 2.9036883656152734e-05,
"loss": 0.0151,
"step": 7590
},
{
"epoch": 3.822937625754527,
"grad_norm": 0.30079197883605957,
"learning_rate": 2.8802872224985434e-05,
"loss": 0.0167,
"step": 7600
},
{
"epoch": 3.8279678068410465,
"grad_norm": 0.16451403498649597,
"learning_rate": 2.8569648822163185e-05,
"loss": 0.0113,
"step": 7610
},
{
"epoch": 3.8329979879275653,
"grad_norm": 0.28625065088272095,
"learning_rate": 2.8337216029063395e-05,
"loss": 0.0146,
"step": 7620
},
{
"epoch": 3.8380281690140845,
"grad_norm": 0.22233416140079498,
"learning_rate": 2.810557641831266e-05,
"loss": 0.0201,
"step": 7630
},
{
"epoch": 3.8430583501006037,
"grad_norm": 0.10642603039741516,
"learning_rate": 2.787473255375853e-05,
"loss": 0.0125,
"step": 7640
},
{
"epoch": 3.8480885311871225,
"grad_norm": 0.2190830409526825,
"learning_rate": 2.7644686990441027e-05,
"loss": 0.0169,
"step": 7650
},
{
"epoch": 3.853118712273642,
"grad_norm": 0.1361585557460785,
"learning_rate": 2.7415442274564273e-05,
"loss": 0.0144,
"step": 7660
},
{
"epoch": 3.858148893360161,
"grad_norm": 0.27642127871513367,
"learning_rate": 2.718700094346851e-05,
"loss": 0.0131,
"step": 7670
},
{
"epoch": 3.86317907444668,
"grad_norm": 0.3688669800758362,
"learning_rate": 2.6959365525601822e-05,
"loss": 0.0142,
"step": 7680
},
{
"epoch": 3.8682092555331993,
"grad_norm": 0.06550594419240952,
"learning_rate": 2.6732538540492292e-05,
"loss": 0.0151,
"step": 7690
},
{
"epoch": 3.873239436619718,
"grad_norm": 0.2921280264854431,
"learning_rate": 2.6506522498720065e-05,
"loss": 0.0107,
"step": 7700
},
{
"epoch": 3.8782696177062377,
"grad_norm": 0.2402195930480957,
"learning_rate": 2.6281319901889488e-05,
"loss": 0.0162,
"step": 7710
},
{
"epoch": 3.8832997987927564,
"grad_norm": 0.15326948463916779,
"learning_rate": 2.6056933242601544e-05,
"loss": 0.0171,
"step": 7720
},
{
"epoch": 3.8883299798792756,
"grad_norm": 0.15069030225276947,
"learning_rate": 2.5833365004426215e-05,
"loss": 0.0137,
"step": 7730
},
{
"epoch": 3.893360160965795,
"grad_norm": 0.2580581307411194,
"learning_rate": 2.561061766187496e-05,
"loss": 0.0159,
"step": 7740
},
{
"epoch": 3.8983903420523136,
"grad_norm": 0.20609134435653687,
"learning_rate": 2.538869368037332e-05,
"loss": 0.0165,
"step": 7750
},
{
"epoch": 3.9034205231388333,
"grad_norm": 0.12185626477003098,
"learning_rate": 2.5167595516233722e-05,
"loss": 0.0148,
"step": 7760
},
{
"epoch": 3.908450704225352,
"grad_norm": 0.14162567257881165,
"learning_rate": 2.4947325616628225e-05,
"loss": 0.0139,
"step": 7770
},
{
"epoch": 3.913480885311871,
"grad_norm": 0.14534728229045868,
"learning_rate": 2.4727886419561374e-05,
"loss": 0.0129,
"step": 7780
},
{
"epoch": 3.9185110663983904,
"grad_norm": 0.1677662879228592,
"learning_rate": 2.450928035384339e-05,
"loss": 0.0177,
"step": 7790
},
{
"epoch": 3.9235412474849096,
"grad_norm": 0.35635554790496826,
"learning_rate": 2.4291509839063042e-05,
"loss": 0.0138,
"step": 7800
},
{
"epoch": 3.928571428571429,
"grad_norm": 0.1618734896183014,
"learning_rate": 2.407457728556115e-05,
"loss": 0.0133,
"step": 7810
},
{
"epoch": 3.9336016096579476,
"grad_norm": 0.1893433928489685,
"learning_rate": 2.385848509440364e-05,
"loss": 0.0152,
"step": 7820
},
{
"epoch": 3.938631790744467,
"grad_norm": 0.2308843731880188,
"learning_rate": 2.3643235657355145e-05,
"loss": 0.0128,
"step": 7830
},
{
"epoch": 3.943661971830986,
"grad_norm": 0.21923568844795227,
"learning_rate": 2.342883135685253e-05,
"loss": 0.0147,
"step": 7840
},
{
"epoch": 3.948692152917505,
"grad_norm": 0.24298080801963806,
"learning_rate": 2.321527456597833e-05,
"loss": 0.0177,
"step": 7850
},
{
"epoch": 3.9537223340040244,
"grad_norm": 0.14858295023441315,
"learning_rate": 2.300256764843477e-05,
"loss": 0.0154,
"step": 7860
},
{
"epoch": 3.958752515090543,
"grad_norm": 0.21552342176437378,
"learning_rate": 2.2790712958517324e-05,
"loss": 0.013,
"step": 7870
},
{
"epoch": 3.9637826961770624,
"grad_norm": 0.2514561712741852,
"learning_rate": 2.2579712841088873e-05,
"loss": 0.0153,
"step": 7880
},
{
"epoch": 3.9688128772635816,
"grad_norm": 0.1734912097454071,
"learning_rate": 2.23695696315537e-05,
"loss": 0.0198,
"step": 7890
},
{
"epoch": 3.9738430583501008,
"grad_norm": 0.2863246500492096,
"learning_rate": 2.216028565583148e-05,
"loss": 0.016,
"step": 7900
},
{
"epoch": 3.97887323943662,
"grad_norm": 0.2181919515132904,
"learning_rate": 2.1951863230331793e-05,
"loss": 0.012,
"step": 7910
},
{
"epoch": 3.9839034205231387,
"grad_norm": 0.2490726262331009,
"learning_rate": 2.174430466192826e-05,
"loss": 0.0121,
"step": 7920
},
{
"epoch": 3.988933601609658,
"grad_norm": 0.10584773123264313,
"learning_rate": 2.153761224793317e-05,
"loss": 0.0144,
"step": 7930
},
{
"epoch": 3.993963782696177,
"grad_norm": 0.16053903102874756,
"learning_rate": 2.1331788276072007e-05,
"loss": 0.0169,
"step": 7940
},
{
"epoch": 3.9989939637826963,
"grad_norm": 0.11051679402589798,
"learning_rate": 2.1126835024458003e-05,
"loss": 0.0094,
"step": 7950
},
{
"epoch": 4.0040241448692155,
"grad_norm": 0.2628016173839569,
"learning_rate": 2.0922754761567143e-05,
"loss": 0.0132,
"step": 7960
},
{
"epoch": 4.009054325955734,
"grad_norm": 0.14394591748714447,
"learning_rate": 2.0719549746212897e-05,
"loss": 0.0148,
"step": 7970
},
{
"epoch": 4.014084507042254,
"grad_norm": 0.11262981593608856,
"learning_rate": 2.0517222227521304e-05,
"loss": 0.0096,
"step": 7980
},
{
"epoch": 4.019114688128773,
"grad_norm": 0.09093613922595978,
"learning_rate": 2.0315774444905965e-05,
"loss": 0.0128,
"step": 7990
},
{
"epoch": 4.0241448692152915,
"grad_norm": 0.1167258694767952,
"learning_rate": 2.0115208628043436e-05,
"loss": 0.0167,
"step": 8000
},
{
"epoch": 4.029175050301811,
"grad_norm": 0.1440437287092209,
"learning_rate": 1.9915526996848333e-05,
"loss": 0.0136,
"step": 8010
},
{
"epoch": 4.03420523138833,
"grad_norm": 0.23001736402511597,
"learning_rate": 1.971673176144896e-05,
"loss": 0.0133,
"step": 8020
},
{
"epoch": 4.0392354124748495,
"grad_norm": 0.18372376263141632,
"learning_rate": 1.9518825122162766e-05,
"loss": 0.0121,
"step": 8030
},
{
"epoch": 4.044265593561368,
"grad_norm": 0.16671916842460632,
"learning_rate": 1.932180926947189e-05,
"loss": 0.0124,
"step": 8040
},
{
"epoch": 4.049295774647887,
"grad_norm": 0.22989486157894135,
"learning_rate": 1.912568638399915e-05,
"loss": 0.0142,
"step": 8050
},
{
"epoch": 4.054325955734407,
"grad_norm": 0.28540411591529846,
"learning_rate": 1.893045863648364e-05,
"loss": 0.0162,
"step": 8060
},
{
"epoch": 4.059356136820925,
"grad_norm": 0.21366238594055176,
"learning_rate": 1.873612818775692e-05,
"loss": 0.0139,
"step": 8070
},
{
"epoch": 4.064386317907445,
"grad_norm": 0.08002448827028275,
"learning_rate": 1.8542697188719005e-05,
"loss": 0.0168,
"step": 8080
},
{
"epoch": 4.069416498993964,
"grad_norm": 0.17863331735134125,
"learning_rate": 1.83501677803145e-05,
"loss": 0.0098,
"step": 8090
},
{
"epoch": 4.074446680080483,
"grad_norm": 0.2530530095100403,
"learning_rate": 1.815854209350908e-05,
"loss": 0.013,
"step": 8100
},
{
"epoch": 4.079476861167002,
"grad_norm": 0.18760378658771515,
"learning_rate": 1.7967822249265677e-05,
"loss": 0.0166,
"step": 8110
},
{
"epoch": 4.084507042253521,
"grad_norm": 0.09926089644432068,
"learning_rate": 1.777801035852119e-05,
"loss": 0.018,
"step": 8120
},
{
"epoch": 4.089537223340041,
"grad_norm": 0.24970552325248718,
"learning_rate": 1.758910852216309e-05,
"loss": 0.0157,
"step": 8130
},
{
"epoch": 4.094567404426559,
"grad_norm": 0.12321308255195618,
"learning_rate": 1.7401118831006004e-05,
"loss": 0.0115,
"step": 8140
},
{
"epoch": 4.099597585513078,
"grad_norm": 0.1537485420703888,
"learning_rate": 1.721404336576884e-05,
"loss": 0.0092,
"step": 8150
},
{
"epoch": 4.104627766599598,
"grad_norm": 0.13459086418151855,
"learning_rate": 1.702788419705148e-05,
"loss": 0.0116,
"step": 8160
},
{
"epoch": 4.109657947686117,
"grad_norm": 0.3569525480270386,
"learning_rate": 1.684264338531214e-05,
"loss": 0.0155,
"step": 8170
},
{
"epoch": 4.114688128772636,
"grad_norm": 0.11241878569126129,
"learning_rate": 1.6658322980844298e-05,
"loss": 0.0089,
"step": 8180
},
{
"epoch": 4.119718309859155,
"grad_norm": 0.31290173530578613,
"learning_rate": 1.6474925023754174e-05,
"loss": 0.0148,
"step": 8190
},
{
"epoch": 4.124748490945674,
"grad_norm": 0.14965899288654327,
"learning_rate": 1.6292451543938124e-05,
"loss": 0.0131,
"step": 8200
},
{
"epoch": 4.129778672032193,
"grad_norm": 0.29743674397468567,
"learning_rate": 1.6110904561060126e-05,
"loss": 0.016,
"step": 8210
},
{
"epoch": 4.134808853118712,
"grad_norm": 0.26381608843803406,
"learning_rate": 1.5930286084529457e-05,
"loss": 0.0127,
"step": 8220
},
{
"epoch": 4.139839034205232,
"grad_norm": 0.3434705436229706,
"learning_rate": 1.57505981134784e-05,
"loss": 0.0138,
"step": 8230
},
{
"epoch": 4.144869215291751,
"grad_norm": 0.3142394423484802,
"learning_rate": 1.557184263674024e-05,
"loss": 0.0176,
"step": 8240
},
{
"epoch": 4.149899396378269,
"grad_norm": 0.17862974107265472,
"learning_rate": 1.5394021632827093e-05,
"loss": 0.0123,
"step": 8250
},
{
"epoch": 4.154929577464789,
"grad_norm": 0.2902679145336151,
"learning_rate": 1.5217137069908128e-05,
"loss": 0.0153,
"step": 8260
},
{
"epoch": 4.159959758551308,
"grad_norm": 0.28409862518310547,
"learning_rate": 1.5041190905787772e-05,
"loss": 0.0117,
"step": 8270
},
{
"epoch": 4.164989939637827,
"grad_norm": 0.32609495520591736,
"learning_rate": 1.4866185087883933e-05,
"loss": 0.0165,
"step": 8280
},
{
"epoch": 4.170020120724346,
"grad_norm": 0.3060837686061859,
"learning_rate": 1.4692121553206595e-05,
"loss": 0.0153,
"step": 8290
},
{
"epoch": 4.175050301810865,
"grad_norm": 0.25559473037719727,
"learning_rate": 1.4519002228336232e-05,
"loss": 0.0104,
"step": 8300
},
{
"epoch": 4.1800804828973845,
"grad_norm": 0.12555906176567078,
"learning_rate": 1.4346829029402654e-05,
"loss": 0.0086,
"step": 8310
},
{
"epoch": 4.185110663983903,
"grad_norm": 0.12174857407808304,
"learning_rate": 1.4175603862063591e-05,
"loss": 0.0088,
"step": 8320
},
{
"epoch": 4.190140845070423,
"grad_norm": 0.11725517362356186,
"learning_rate": 1.4005328621483794e-05,
"loss": 0.0152,
"step": 8330
},
{
"epoch": 4.195171026156942,
"grad_norm": 0.20686453580856323,
"learning_rate": 1.3836005192313994e-05,
"loss": 0.0159,
"step": 8340
},
{
"epoch": 4.2002012072434605,
"grad_norm": 0.2010798454284668,
"learning_rate": 1.3667635448669913e-05,
"loss": 0.0136,
"step": 8350
},
{
"epoch": 4.20523138832998,
"grad_norm": 0.18625499308109283,
"learning_rate": 1.3500221254111777e-05,
"loss": 0.0222,
"step": 8360
},
{
"epoch": 4.210261569416499,
"grad_norm": 0.2409621626138687,
"learning_rate": 1.3333764461623421e-05,
"loss": 0.0234,
"step": 8370
},
{
"epoch": 4.2152917505030185,
"grad_norm": 0.12151456624269485,
"learning_rate": 1.3168266913591976e-05,
"loss": 0.0152,
"step": 8380
},
{
"epoch": 4.220321931589537,
"grad_norm": 0.2698768675327301,
"learning_rate": 1.3003730441787399e-05,
"loss": 0.0104,
"step": 8390
},
{
"epoch": 4.225352112676056,
"grad_norm": 0.21013912558555603,
"learning_rate": 1.2840156867342179e-05,
"loss": 0.0095,
"step": 8400
},
{
"epoch": 4.230382293762576,
"grad_norm": 0.10292834788560867,
"learning_rate": 1.2677548000731243e-05,
"loss": 0.0113,
"step": 8410
},
{
"epoch": 4.2354124748490944,
"grad_norm": 0.05743186175823212,
"learning_rate": 1.2515905641751824e-05,
"loss": 0.0105,
"step": 8420
},
{
"epoch": 4.240442655935614,
"grad_norm": 0.18373721837997437,
"learning_rate": 1.2355231579503645e-05,
"loss": 0.0156,
"step": 8430
},
{
"epoch": 4.245472837022133,
"grad_norm": 0.18463782966136932,
"learning_rate": 1.219552759236906e-05,
"loss": 0.0101,
"step": 8440
},
{
"epoch": 4.250503018108652,
"grad_norm": 0.13232889771461487,
"learning_rate": 1.2036795447993387e-05,
"loss": 0.0099,
"step": 8450
},
{
"epoch": 4.255533199195171,
"grad_norm": 0.1151013895869255,
"learning_rate": 1.1879036903265328e-05,
"loss": 0.0096,
"step": 8460
},
{
"epoch": 4.26056338028169,
"grad_norm": 0.1502464860677719,
"learning_rate": 1.1722253704297492e-05,
"loss": 0.0114,
"step": 8470
},
{
"epoch": 4.26559356136821,
"grad_norm": 0.2458263784646988,
"learning_rate": 1.1566447586407169e-05,
"loss": 0.017,
"step": 8480
},
{
"epoch": 4.270623742454728,
"grad_norm": 0.2860092222690582,
"learning_rate": 1.1411620274097013e-05,
"loss": 0.017,
"step": 8490
},
{
"epoch": 4.275653923541247,
"grad_norm": 0.14720211923122406,
"learning_rate": 1.1257773481036049e-05,
"loss": 0.0135,
"step": 8500
},
{
"epoch": 4.280684104627767,
"grad_norm": 0.3655484616756439,
"learning_rate": 1.110490891004059e-05,
"loss": 0.0228,
"step": 8510
},
{
"epoch": 4.285714285714286,
"grad_norm": 0.08732811361551285,
"learning_rate": 1.0953028253055542e-05,
"loss": 0.0096,
"step": 8520
},
{
"epoch": 4.290744466800805,
"grad_norm": 0.14107802510261536,
"learning_rate": 1.0802133191135566e-05,
"loss": 0.0096,
"step": 8530
},
{
"epoch": 4.295774647887324,
"grad_norm": 0.18336671590805054,
"learning_rate": 1.0652225394426441e-05,
"loss": 0.0146,
"step": 8540
},
{
"epoch": 4.300804828973843,
"grad_norm": 0.1546245664358139,
"learning_rate": 1.0503306522146738e-05,
"loss": 0.0113,
"step": 8550
},
{
"epoch": 4.305835010060362,
"grad_norm": 0.1467217206954956,
"learning_rate": 1.0355378222569256e-05,
"loss": 0.0136,
"step": 8560
},
{
"epoch": 4.310865191146881,
"grad_norm": 0.1393778920173645,
"learning_rate": 1.0208442133002948e-05,
"loss": 0.0107,
"step": 8570
},
{
"epoch": 4.315895372233401,
"grad_norm": 0.3030548393726349,
"learning_rate": 1.0062499879774734e-05,
"loss": 0.0153,
"step": 8580
},
{
"epoch": 4.32092555331992,
"grad_norm": 0.30646151304244995,
"learning_rate": 9.917553078211417e-06,
"loss": 0.0117,
"step": 8590
},
{
"epoch": 4.325955734406438,
"grad_norm": 0.15872900187969208,
"learning_rate": 9.773603332621972e-06,
"loss": 0.0114,
"step": 8600
},
{
"epoch": 4.330985915492958,
"grad_norm": 0.2105736881494522,
"learning_rate": 9.630652236279625e-06,
"loss": 0.0108,
"step": 8610
},
{
"epoch": 4.336016096579477,
"grad_norm": 0.13897240161895752,
"learning_rate": 9.488701371404329e-06,
"loss": 0.0089,
"step": 8620
},
{
"epoch": 4.341046277665996,
"grad_norm": 0.2540101408958435,
"learning_rate": 9.347752309145241e-06,
"loss": 0.0212,
"step": 8630
},
{
"epoch": 4.346076458752515,
"grad_norm": 0.10785099118947983,
"learning_rate": 9.20780660956324e-06,
"loss": 0.0082,
"step": 8640
},
{
"epoch": 4.351106639839034,
"grad_norm": 0.10478243976831436,
"learning_rate": 9.068865821613803e-06,
"loss": 0.0131,
"step": 8650
},
{
"epoch": 4.3561368209255535,
"grad_norm": 0.12173474580049515,
"learning_rate": 8.930931483129667e-06,
"loss": 0.0107,
"step": 8660
},
{
"epoch": 4.361167002012072,
"grad_norm": 0.19661535322666168,
"learning_rate": 8.794005120804082e-06,
"loss": 0.0121,
"step": 8670
},
{
"epoch": 4.366197183098592,
"grad_norm": 0.1414082646369934,
"learning_rate": 8.658088250173624e-06,
"loss": 0.0106,
"step": 8680
},
{
"epoch": 4.371227364185111,
"grad_norm": 0.07380446791648865,
"learning_rate": 8.523182375601635e-06,
"loss": 0.0116,
"step": 8690
},
{
"epoch": 4.3762575452716295,
"grad_norm": 0.1488698124885559,
"learning_rate": 8.389288990261413e-06,
"loss": 0.0093,
"step": 8700
},
{
"epoch": 4.381287726358149,
"grad_norm": 0.17949314415454865,
"learning_rate": 8.256409576119827e-06,
"loss": 0.0099,
"step": 8710
},
{
"epoch": 4.386317907444668,
"grad_norm": 0.34648457169532776,
"learning_rate": 8.124545603920842e-06,
"loss": 0.0109,
"step": 8720
},
{
"epoch": 4.3913480885311875,
"grad_norm": 0.16411487758159637,
"learning_rate": 7.993698533169192e-06,
"loss": 0.0092,
"step": 8730
},
{
"epoch": 4.396378269617706,
"grad_norm": 0.20851458609104156,
"learning_rate": 7.863869812114366e-06,
"loss": 0.0125,
"step": 8740
},
{
"epoch": 4.401408450704225,
"grad_norm": 0.13391834497451782,
"learning_rate": 7.73506087773439e-06,
"loss": 0.0111,
"step": 8750
},
{
"epoch": 4.406438631790745,
"grad_norm": 0.2575322687625885,
"learning_rate": 7.60727315572013e-06,
"loss": 0.0137,
"step": 8760
},
{
"epoch": 4.4114688128772634,
"grad_norm": 0.1566155105829239,
"learning_rate": 7.480508060459346e-06,
"loss": 0.0121,
"step": 8770
},
{
"epoch": 4.416498993963783,
"grad_norm": 0.23036964237689972,
"learning_rate": 7.3547669950211005e-06,
"loss": 0.0189,
"step": 8780
},
{
"epoch": 4.421529175050302,
"grad_norm": 0.16328741610050201,
"learning_rate": 7.230051351140266e-06,
"loss": 0.0188,
"step": 8790
},
{
"epoch": 4.426559356136821,
"grad_norm": 0.45226040482521057,
"learning_rate": 7.106362509202036e-06,
"loss": 0.0175,
"step": 8800
},
{
"epoch": 4.43158953722334,
"grad_norm": 0.4067038297653198,
"learning_rate": 6.983701838226708e-06,
"loss": 0.0134,
"step": 8810
},
{
"epoch": 4.436619718309859,
"grad_norm": 0.1728140115737915,
"learning_rate": 6.86207069585455e-06,
"loss": 0.0096,
"step": 8820
},
{
"epoch": 4.441649899396379,
"grad_norm": 0.22685836255550385,
"learning_rate": 6.741470428330676e-06,
"loss": 0.0135,
"step": 8830
},
{
"epoch": 4.446680080482897,
"grad_norm": 0.18156301975250244,
"learning_rate": 6.621902370490274e-06,
"loss": 0.0144,
"step": 8840
},
{
"epoch": 4.451710261569416,
"grad_norm": 0.20519036054611206,
"learning_rate": 6.503367845743702e-06,
"loss": 0.015,
"step": 8850
},
{
"epoch": 4.456740442655936,
"grad_norm": 0.26189279556274414,
"learning_rate": 6.385868166061981e-06,
"loss": 0.011,
"step": 8860
},
{
"epoch": 4.461770623742455,
"grad_norm": 0.1770821064710617,
"learning_rate": 6.269404631962106e-06,
"loss": 0.0112,
"step": 8870
},
{
"epoch": 4.466800804828974,
"grad_norm": 0.31295013427734375,
"learning_rate": 6.153978532492821e-06,
"loss": 0.0186,
"step": 8880
},
{
"epoch": 4.471830985915493,
"grad_norm": 0.47270047664642334,
"learning_rate": 6.0395911452202355e-06,
"loss": 0.0149,
"step": 8890
},
{
"epoch": 4.476861167002012,
"grad_norm": 0.1958407759666443,
"learning_rate": 5.926243736213743e-06,
"loss": 0.0076,
"step": 8900
},
{
"epoch": 4.481891348088531,
"grad_norm": 0.14082856476306915,
"learning_rate": 5.813937560031979e-06,
"loss": 0.014,
"step": 8910
},
{
"epoch": 4.48692152917505,
"grad_norm": 0.2103276550769806,
"learning_rate": 5.702673859708896e-06,
"loss": 0.0121,
"step": 8920
},
{
"epoch": 4.49195171026157,
"grad_norm": 0.10313666611909866,
"learning_rate": 5.592453866740155e-06,
"loss": 0.0118,
"step": 8930
},
{
"epoch": 4.496981891348089,
"grad_norm": 0.23443832993507385,
"learning_rate": 5.48327880106927e-06,
"loss": 0.0123,
"step": 8940
},
{
"epoch": 4.502012072434607,
"grad_norm": 0.08273012936115265,
"learning_rate": 5.375149871074336e-06,
"loss": 0.0101,
"step": 8950
},
{
"epoch": 4.507042253521127,
"grad_norm": 0.13498452305793762,
"learning_rate": 5.268068273554483e-06,
"loss": 0.0142,
"step": 8960
},
{
"epoch": 4.512072434607646,
"grad_norm": 0.12951943278312683,
"learning_rate": 5.1620351937167076e-06,
"loss": 0.0111,
"step": 8970
},
{
"epoch": 4.517102615694165,
"grad_norm": 0.18177278339862823,
"learning_rate": 5.057051805162749e-06,
"loss": 0.0143,
"step": 8980
},
{
"epoch": 4.522132796780684,
"grad_norm": 0.23840029537677765,
"learning_rate": 4.953119269876061e-06,
"loss": 0.0121,
"step": 8990
},
{
"epoch": 4.527162977867203,
"grad_norm": 0.13470517098903656,
"learning_rate": 4.8502387382090345e-06,
"loss": 0.0136,
"step": 9000
},
{
"epoch": 4.5321931589537225,
"grad_norm": 0.1814277470111847,
"learning_rate": 4.748411348870141e-06,
"loss": 0.0167,
"step": 9010
},
{
"epoch": 4.537223340040241,
"grad_norm": 0.3026091158390045,
"learning_rate": 4.647638228911466e-06,
"loss": 0.0157,
"step": 9020
},
{
"epoch": 4.542253521126761,
"grad_norm": 0.13517017662525177,
"learning_rate": 4.547920493716118e-06,
"loss": 0.0093,
"step": 9030
},
{
"epoch": 4.54728370221328,
"grad_norm": 0.06449972838163376,
"learning_rate": 4.4492592469859486e-06,
"loss": 0.0101,
"step": 9040
},
{
"epoch": 4.5523138832997985,
"grad_norm": 0.24287287890911102,
"learning_rate": 4.3516555807293415e-06,
"loss": 0.015,
"step": 9050
},
{
"epoch": 4.557344064386318,
"grad_norm": 0.13659290969371796,
"learning_rate": 4.255110575249055e-06,
"loss": 0.0098,
"step": 9060
},
{
"epoch": 4.562374245472837,
"grad_norm": 0.258847177028656,
"learning_rate": 4.1596252991303655e-06,
"loss": 0.0109,
"step": 9070
},
{
"epoch": 4.5674044265593565,
"grad_norm": 0.2787615954875946,
"learning_rate": 4.065200809229163e-06,
"loss": 0.0116,
"step": 9080
},
{
"epoch": 4.572434607645875,
"grad_norm": 0.1802290678024292,
"learning_rate": 3.971838150660268e-06,
"loss": 0.0101,
"step": 9090
},
{
"epoch": 4.577464788732394,
"grad_norm": 0.22423778474330902,
"learning_rate": 3.879538356785917e-06,
"loss": 0.0103,
"step": 9100
},
{
"epoch": 4.582494969818914,
"grad_norm": 0.18749003112316132,
"learning_rate": 3.7883024492042286e-06,
"loss": 0.0084,
"step": 9110
},
{
"epoch": 4.5875251509054324,
"grad_norm": 0.08380598574876785,
"learning_rate": 3.698131437737995e-06,
"loss": 0.0092,
"step": 9120
},
{
"epoch": 4.592555331991952,
"grad_norm": 0.11922164261341095,
"learning_rate": 3.6090263204234363e-06,
"loss": 0.0129,
"step": 9130
},
{
"epoch": 4.597585513078471,
"grad_norm": 0.13963715732097626,
"learning_rate": 3.520988083499199e-06,
"loss": 0.0113,
"step": 9140
},
{
"epoch": 4.60261569416499,
"grad_norm": 0.15656189620494843,
"learning_rate": 3.434017701395431e-06,
"loss": 0.0109,
"step": 9150
},
{
"epoch": 4.607645875251509,
"grad_norm": 0.0764361172914505,
"learning_rate": 3.348116136722912e-06,
"loss": 0.0131,
"step": 9160
},
{
"epoch": 4.612676056338028,
"grad_norm": 0.23831793665885925,
"learning_rate": 3.2632843402625625e-06,
"loss": 0.0115,
"step": 9170
},
{
"epoch": 4.617706237424548,
"grad_norm": 0.19923029839992523,
"learning_rate": 3.1795232509547633e-06,
"loss": 0.0102,
"step": 9180
},
{
"epoch": 4.622736418511066,
"grad_norm": 0.1905340999364853,
"learning_rate": 3.096833795889076e-06,
"loss": 0.0104,
"step": 9190
},
{
"epoch": 4.627766599597585,
"grad_norm": 0.2911848723888397,
"learning_rate": 3.015216890293904e-06,
"loss": 0.0113,
"step": 9200
},
{
"epoch": 4.632796780684105,
"grad_norm": 0.19489365816116333,
"learning_rate": 2.9346734375264027e-06,
"loss": 0.0145,
"step": 9210
},
{
"epoch": 4.637826961770624,
"grad_norm": 0.10868648439645767,
"learning_rate": 2.8552043290624997e-06,
"loss": 0.0128,
"step": 9220
},
{
"epoch": 4.642857142857143,
"grad_norm": 0.21155446767807007,
"learning_rate": 2.7768104444869436e-06,
"loss": 0.0127,
"step": 9230
},
{
"epoch": 4.647887323943662,
"grad_norm": 0.20911335945129395,
"learning_rate": 2.6994926514836925e-06,
"loss": 0.0159,
"step": 9240
},
{
"epoch": 4.652917505030181,
"grad_norm": 0.20620648562908173,
"learning_rate": 2.6232518058261658e-06,
"loss": 0.0136,
"step": 9250
},
{
"epoch": 4.6579476861167,
"grad_norm": 0.0914982482790947,
"learning_rate": 2.5480887513679166e-06,
"loss": 0.0114,
"step": 9260
},
{
"epoch": 4.662977867203219,
"grad_norm": 0.09038899838924408,
"learning_rate": 2.4740043200332074e-06,
"loss": 0.0125,
"step": 9270
},
{
"epoch": 4.668008048289739,
"grad_norm": 0.13401928544044495,
"learning_rate": 2.400999331807796e-06,
"loss": 0.0155,
"step": 9280
},
{
"epoch": 4.673038229376258,
"grad_norm": 0.18889258801937103,
"learning_rate": 2.3290745947298966e-06,
"loss": 0.0098,
"step": 9290
},
{
"epoch": 4.678068410462776,
"grad_norm": 0.15076126158237457,
"learning_rate": 2.258230904881231e-06,
"loss": 0.0151,
"step": 9300
},
{
"epoch": 4.683098591549296,
"grad_norm": 0.17581748962402344,
"learning_rate": 2.1884690463781833e-06,
"loss": 0.0148,
"step": 9310
},
{
"epoch": 4.688128772635815,
"grad_norm": 0.08170731365680695,
"learning_rate": 2.1197897913632026e-06,
"loss": 0.0216,
"step": 9320
},
{
"epoch": 4.693158953722334,
"grad_norm": 0.19143046438694,
"learning_rate": 2.0521938999961243e-06,
"loss": 0.0166,
"step": 9330
},
{
"epoch": 4.698189134808853,
"grad_norm": 0.13908398151397705,
"learning_rate": 1.9856821204458864e-06,
"loss": 0.0121,
"step": 9340
},
{
"epoch": 4.703219315895372,
"grad_norm": 0.1692146509885788,
"learning_rate": 1.9202551888821807e-06,
"loss": 0.0073,
"step": 9350
},
{
"epoch": 4.7082494969818915,
"grad_norm": 0.11902793496847153,
"learning_rate": 1.855913829467315e-06,
"loss": 0.0098,
"step": 9360
},
{
"epoch": 4.71327967806841,
"grad_norm": 0.3413757383823395,
"learning_rate": 1.7926587543482088e-06,
"loss": 0.0225,
"step": 9370
},
{
"epoch": 4.71830985915493,
"grad_norm": 0.13018754124641418,
"learning_rate": 1.7304906636485097e-06,
"loss": 0.0097,
"step": 9380
},
{
"epoch": 4.723340040241449,
"grad_norm": 0.17999660968780518,
"learning_rate": 1.6694102454608118e-06,
"loss": 0.0174,
"step": 9390
},
{
"epoch": 4.7283702213279675,
"grad_norm": 0.1070287749171257,
"learning_rate": 1.6094181758390947e-06,
"loss": 0.0116,
"step": 9400
},
{
"epoch": 4.733400402414487,
"grad_norm": 0.18182510137557983,
"learning_rate": 1.5505151187912071e-06,
"loss": 0.0109,
"step": 9410
},
{
"epoch": 4.738430583501006,
"grad_norm": 0.1901055872440338,
"learning_rate": 1.4927017262715059e-06,
"loss": 0.0093,
"step": 9420
},
{
"epoch": 4.7434607645875255,
"grad_norm": 0.1983153074979782,
"learning_rate": 1.435978638173685e-06,
"loss": 0.0153,
"step": 9430
},
{
"epoch": 4.748490945674044,
"grad_norm": 0.1601441502571106,
"learning_rate": 1.3803464823236356e-06,
"loss": 0.0101,
"step": 9440
},
{
"epoch": 4.753521126760563,
"grad_norm": 0.1466078907251358,
"learning_rate": 1.325805874472552e-06,
"loss": 0.0183,
"step": 9450
},
{
"epoch": 4.758551307847083,
"grad_norm": 0.24373824894428253,
"learning_rate": 1.272357418290082e-06,
"loss": 0.008,
"step": 9460
},
{
"epoch": 4.7635814889336014,
"grad_norm": 0.08891261368989944,
"learning_rate": 1.2200017053576318e-06,
"loss": 0.0106,
"step": 9470
},
{
"epoch": 4.768611670020121,
"grad_norm": 0.17067329585552216,
"learning_rate": 1.1687393151618931e-06,
"loss": 0.0096,
"step": 9480
},
{
"epoch": 4.77364185110664,
"grad_norm": 0.1952274590730667,
"learning_rate": 1.1185708150883268e-06,
"loss": 0.0122,
"step": 9490
},
{
"epoch": 4.778672032193159,
"grad_norm": 0.18577119708061218,
"learning_rate": 1.0694967604149563e-06,
"loss": 0.0132,
"step": 9500
},
{
"epoch": 4.783702213279678,
"grad_norm": 0.19541311264038086,
"learning_rate": 1.0215176943061955e-06,
"loss": 0.0135,
"step": 9510
},
{
"epoch": 4.788732394366197,
"grad_norm": 0.14900928735733032,
"learning_rate": 9.746341478068298e-07,
"loss": 0.0122,
"step": 9520
},
{
"epoch": 4.793762575452717,
"grad_norm": 0.2078647017478943,
"learning_rate": 9.288466398361783e-07,
"loss": 0.0098,
"step": 9530
},
{
"epoch": 4.798792756539235,
"grad_norm": 0.137592151761055,
"learning_rate": 8.841556771822746e-07,
"loss": 0.0142,
"step": 9540
},
{
"epoch": 4.803822937625754,
"grad_norm": 0.1908939927816391,
"learning_rate": 8.405617544963385e-07,
"loss": 0.0143,
"step": 9550
},
{
"epoch": 4.808853118712274,
"grad_norm": 0.17629876732826233,
"learning_rate": 7.980653542872584e-07,
"loss": 0.0097,
"step": 9560
},
{
"epoch": 4.813883299798793,
"grad_norm": 0.11937274038791656,
"learning_rate": 7.566669469162513e-07,
"loss": 0.0126,
"step": 9570
},
{
"epoch": 4.818913480885312,
"grad_norm": 0.21683438122272491,
"learning_rate": 7.16366990591677e-07,
"loss": 0.0089,
"step": 9580
},
{
"epoch": 4.823943661971831,
"grad_norm": 0.1359615921974182,
"learning_rate": 6.771659313639212e-07,
"loss": 0.01,
"step": 9590
},
{
"epoch": 4.82897384305835,
"grad_norm": 0.23794005811214447,
"learning_rate": 6.390642031205318e-07,
"loss": 0.0183,
"step": 9600
},
{
"epoch": 4.834004024144869,
"grad_norm": 0.10441375523805618,
"learning_rate": 6.020622275813459e-07,
"loss": 0.0107,
"step": 9610
},
{
"epoch": 4.839034205231388,
"grad_norm": 0.2170896977186203,
"learning_rate": 5.661604142938703e-07,
"loss": 0.0144,
"step": 9620
},
{
"epoch": 4.844064386317908,
"grad_norm": 0.1544094830751419,
"learning_rate": 5.313591606287194e-07,
"loss": 0.0095,
"step": 9630
},
{
"epoch": 4.849094567404427,
"grad_norm": 0.09597515314817429,
"learning_rate": 4.976588517752178e-07,
"loss": 0.0078,
"step": 9640
},
{
"epoch": 4.854124748490945,
"grad_norm": 0.18722732365131378,
"learning_rate": 4.6505986073717143e-07,
"loss": 0.0167,
"step": 9650
},
{
"epoch": 4.859154929577465,
"grad_norm": 0.15623292326927185,
"learning_rate": 4.3356254832869204e-07,
"loss": 0.0122,
"step": 9660
},
{
"epoch": 4.864185110663984,
"grad_norm": 0.19763480126857758,
"learning_rate": 4.0316726317023435e-07,
"loss": 0.0274,
"step": 9670
},
{
"epoch": 4.869215291750503,
"grad_norm": 0.0955481231212616,
"learning_rate": 3.7387434168473235e-07,
"loss": 0.0147,
"step": 9680
},
{
"epoch": 4.874245472837022,
"grad_norm": 0.18850040435791016,
"learning_rate": 3.4568410809385774e-07,
"loss": 0.0146,
"step": 9690
},
{
"epoch": 4.879275653923541,
"grad_norm": 0.2946015000343323,
"learning_rate": 3.185968744144563e-07,
"loss": 0.0107,
"step": 9700
},
{
"epoch": 4.8843058350100605,
"grad_norm": 0.29948338866233826,
"learning_rate": 2.926129404550837e-07,
"loss": 0.0088,
"step": 9710
},
{
"epoch": 4.889336016096579,
"grad_norm": 0.23983007669448853,
"learning_rate": 2.6773259381268625e-07,
"loss": 0.0176,
"step": 9720
},
{
"epoch": 4.894366197183099,
"grad_norm": 0.11162281781435013,
"learning_rate": 2.439561098694254e-07,
"loss": 0.0112,
"step": 9730
},
{
"epoch": 4.899396378269618,
"grad_norm": 0.1520148515701294,
"learning_rate": 2.212837517896027e-07,
"loss": 0.0105,
"step": 9740
},
{
"epoch": 4.9044265593561365,
"grad_norm": 0.1838511973619461,
"learning_rate": 1.9971577051678404e-07,
"loss": 0.0193,
"step": 9750
},
{
"epoch": 4.909456740442656,
"grad_norm": 0.1083097979426384,
"learning_rate": 1.7925240477100203e-07,
"loss": 0.0142,
"step": 9760
},
{
"epoch": 4.914486921529175,
"grad_norm": 0.16764095425605774,
"learning_rate": 1.598938810461137e-07,
"loss": 0.0102,
"step": 9770
},
{
"epoch": 4.9195171026156945,
"grad_norm": 0.22907580435276031,
"learning_rate": 1.416404136073024e-07,
"loss": 0.0144,
"step": 9780
},
{
"epoch": 4.924547283702213,
"grad_norm": 0.2832927703857422,
"learning_rate": 1.2449220448870204e-07,
"loss": 0.0134,
"step": 9790
},
{
"epoch": 4.929577464788732,
"grad_norm": 0.10838713496923447,
"learning_rate": 1.0844944349114316e-07,
"loss": 0.01,
"step": 9800
},
{
"epoch": 4.934607645875252,
"grad_norm": 0.0964408740401268,
"learning_rate": 9.351230818008815e-08,
"loss": 0.0109,
"step": 9810
},
{
"epoch": 4.9396378269617705,
"grad_norm": 0.1444048136472702,
"learning_rate": 7.968096388364377e-08,
"loss": 0.0113,
"step": 9820
},
{
"epoch": 4.94466800804829,
"grad_norm": 0.09675177931785583,
"learning_rate": 6.69555636907182e-08,
"loss": 0.0104,
"step": 9830
},
{
"epoch": 4.949698189134809,
"grad_norm": 0.14633896946907043,
"learning_rate": 5.533624844936691e-08,
"loss": 0.009,
"step": 9840
},
{
"epoch": 4.954728370221328,
"grad_norm": 0.27734798192977905,
"learning_rate": 4.4823146765182735e-08,
"loss": 0.011,
"step": 9850
},
{
"epoch": 4.959758551307847,
"grad_norm": 0.2644200921058655,
"learning_rate": 3.5416374999919235e-08,
"loss": 0.0124,
"step": 9860
},
{
"epoch": 4.964788732394366,
"grad_norm": 0.24501149356365204,
"learning_rate": 2.7116037270169538e-08,
"loss": 0.0123,
"step": 9870
},
{
"epoch": 4.969818913480886,
"grad_norm": 0.24458244442939758,
"learning_rate": 1.9922225446245e-08,
"loss": 0.0146,
"step": 9880
},
{
"epoch": 4.974849094567404,
"grad_norm": 0.06575839966535568,
"learning_rate": 1.383501915112051e-08,
"loss": 0.0089,
"step": 9890
},
{
"epoch": 4.979879275653923,
"grad_norm": 0.2329985499382019,
"learning_rate": 8.854485759568487e-09,
"loss": 0.0136,
"step": 9900
},
{
"epoch": 4.984909456740443,
"grad_norm": 0.17096395790576935,
"learning_rate": 4.980680397448367e-09,
"loss": 0.0118,
"step": 9910
},
{
"epoch": 4.989939637826962,
"grad_norm": 0.16430173814296722,
"learning_rate": 2.213645941029352e-09,
"loss": 0.0088,
"step": 9920
},
{
"epoch": 4.994969818913481,
"grad_norm": 0.23126807808876038,
"learning_rate": 5.534130165907314e-10,
"loss": 0.0184,
"step": 9930
},
{
"epoch": 5.0,
"grad_norm": 0.1565922051668167,
"learning_rate": 0.0,
"loss": 0.0118,
"step": 9940
},
{
"epoch": 5.0,
"step": 9940,
"total_flos": 3.545130433511883e+17,
"train_loss": 0.029606477042259105,
"train_runtime": 4441.4052,
"train_samples_per_second": 35.808,
"train_steps_per_second": 2.238
}
],
"logging_steps": 10,
"max_steps": 9940,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.545130433511883e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}