so100_test7-27uf0d4pij / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 2735,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018281535648994516,
"grad_norm": 16.712358474731445,
"learning_rate": 1.45985401459854e-05,
"loss": 1.4822,
"step": 10
},
{
"epoch": 0.03656307129798903,
"grad_norm": 7.676208019256592,
"learning_rate": 2.91970802919708e-05,
"loss": 0.7397,
"step": 20
},
{
"epoch": 0.054844606946983544,
"grad_norm": 2.2206971645355225,
"learning_rate": 4.379562043795621e-05,
"loss": 0.4701,
"step": 30
},
{
"epoch": 0.07312614259597806,
"grad_norm": 1.7638039588928223,
"learning_rate": 5.83941605839416e-05,
"loss": 0.2966,
"step": 40
},
{
"epoch": 0.09140767824497258,
"grad_norm": 1.6052724123001099,
"learning_rate": 7.299270072992701e-05,
"loss": 0.2162,
"step": 50
},
{
"epoch": 0.10968921389396709,
"grad_norm": 2.617760181427002,
"learning_rate": 8.759124087591242e-05,
"loss": 0.2019,
"step": 60
},
{
"epoch": 0.12797074954296161,
"grad_norm": 1.7860541343688965,
"learning_rate": 0.00010218978102189782,
"loss": 0.1457,
"step": 70
},
{
"epoch": 0.14625228519195613,
"grad_norm": 1.4474908113479614,
"learning_rate": 0.0001167883211678832,
"loss": 0.1387,
"step": 80
},
{
"epoch": 0.16453382084095064,
"grad_norm": 1.5035394430160522,
"learning_rate": 0.0001313868613138686,
"loss": 0.1363,
"step": 90
},
{
"epoch": 0.18281535648994515,
"grad_norm": 1.4598884582519531,
"learning_rate": 0.00014598540145985403,
"loss": 0.1124,
"step": 100
},
{
"epoch": 0.20109689213893966,
"grad_norm": 1.7308577299118042,
"learning_rate": 0.00016058394160583942,
"loss": 0.1215,
"step": 110
},
{
"epoch": 0.21937842778793418,
"grad_norm": 1.5704491138458252,
"learning_rate": 0.00017518248175182484,
"loss": 0.1332,
"step": 120
},
{
"epoch": 0.2376599634369287,
"grad_norm": 0.7519080638885498,
"learning_rate": 0.00018978102189781023,
"loss": 0.1015,
"step": 130
},
{
"epoch": 0.25594149908592323,
"grad_norm": 1.0302314758300781,
"learning_rate": 0.00019999934198849153,
"loss": 0.1043,
"step": 140
},
{
"epoch": 0.2742230347349177,
"grad_norm": 1.1439878940582275,
"learning_rate": 0.00019998764424701714,
"loss": 0.1105,
"step": 150
},
{
"epoch": 0.29250457038391225,
"grad_norm": 0.8649179935455322,
"learning_rate": 0.00019996132599641746,
"loss": 0.0969,
"step": 160
},
{
"epoch": 0.31078610603290674,
"grad_norm": 0.9194239377975464,
"learning_rate": 0.00019992039108503024,
"loss": 0.097,
"step": 170
},
{
"epoch": 0.3290676416819013,
"grad_norm": 0.6259992718696594,
"learning_rate": 0.00019986484549848745,
"loss": 0.0853,
"step": 180
},
{
"epoch": 0.3473491773308958,
"grad_norm": 1.0033239126205444,
"learning_rate": 0.00019979469735884026,
"loss": 0.0944,
"step": 190
},
{
"epoch": 0.3656307129798903,
"grad_norm": 1.261385440826416,
"learning_rate": 0.00019970995692337114,
"loss": 0.1078,
"step": 200
},
{
"epoch": 0.38391224862888484,
"grad_norm": 0.9231658577919006,
"learning_rate": 0.00019961063658309418,
"loss": 0.0821,
"step": 210
},
{
"epoch": 0.40219378427787933,
"grad_norm": 0.996103048324585,
"learning_rate": 0.00019949675086094326,
"loss": 0.0911,
"step": 220
},
{
"epoch": 0.42047531992687387,
"grad_norm": 0.9832742810249329,
"learning_rate": 0.0001993683164096483,
"loss": 0.0692,
"step": 230
},
{
"epoch": 0.43875685557586835,
"grad_norm": 0.6472922563552856,
"learning_rate": 0.00019922535200930046,
"loss": 0.0706,
"step": 240
},
{
"epoch": 0.4570383912248629,
"grad_norm": 0.5999054312705994,
"learning_rate": 0.00019906787856460581,
"loss": 0.0731,
"step": 250
},
{
"epoch": 0.4753199268738574,
"grad_norm": 0.667738139629364,
"learning_rate": 0.00019889591910182876,
"loss": 0.0708,
"step": 260
},
{
"epoch": 0.4936014625228519,
"grad_norm": 0.554964542388916,
"learning_rate": 0.0001987094987654251,
"loss": 0.0591,
"step": 270
},
{
"epoch": 0.5118829981718465,
"grad_norm": 1.1600011587142944,
"learning_rate": 0.00019850864481436514,
"loss": 0.0795,
"step": 280
},
{
"epoch": 0.5301645338208409,
"grad_norm": 0.6419970393180847,
"learning_rate": 0.00019829338661814797,
"loss": 0.0659,
"step": 290
},
{
"epoch": 0.5484460694698354,
"grad_norm": 0.735856831073761,
"learning_rate": 0.00019806375565250685,
"loss": 0.0724,
"step": 300
},
{
"epoch": 0.56672760511883,
"grad_norm": 0.5395373106002808,
"learning_rate": 0.00019781978549480682,
"loss": 0.0626,
"step": 310
},
{
"epoch": 0.5850091407678245,
"grad_norm": 0.8947715759277344,
"learning_rate": 0.00019756151181913483,
"loss": 0.0601,
"step": 320
},
{
"epoch": 0.603290676416819,
"grad_norm": 0.5075414180755615,
"learning_rate": 0.00019728897239108342,
"loss": 0.0691,
"step": 330
},
{
"epoch": 0.6215722120658135,
"grad_norm": 1.3236219882965088,
"learning_rate": 0.00019700220706222858,
"loss": 0.0488,
"step": 340
},
{
"epoch": 0.6398537477148081,
"grad_norm": 0.9153704047203064,
"learning_rate": 0.00019670125776430228,
"loss": 0.0622,
"step": 350
},
{
"epoch": 0.6581352833638026,
"grad_norm": 0.6496918797492981,
"learning_rate": 0.00019638616850306133,
"loss": 0.0572,
"step": 360
},
{
"epoch": 0.676416819012797,
"grad_norm": 0.6905117034912109,
"learning_rate": 0.00019605698535185266,
"loss": 0.0506,
"step": 370
},
{
"epoch": 0.6946983546617916,
"grad_norm": 0.6502402424812317,
"learning_rate": 0.00019571375644487625,
"loss": 0.0528,
"step": 380
},
{
"epoch": 0.7129798903107861,
"grad_norm": 0.7400691509246826,
"learning_rate": 0.0001953565319701469,
"loss": 0.0674,
"step": 390
},
{
"epoch": 0.7312614259597806,
"grad_norm": 0.5896055698394775,
"learning_rate": 0.0001949853641621555,
"loss": 0.0471,
"step": 400
},
{
"epoch": 0.7495429616087751,
"grad_norm": 0.4026470482349396,
"learning_rate": 0.00019460030729423114,
"loss": 0.0512,
"step": 410
},
{
"epoch": 0.7678244972577697,
"grad_norm": 0.47957828640937805,
"learning_rate": 0.0001942014176706052,
"loss": 0.0629,
"step": 420
},
{
"epoch": 0.7861060329067642,
"grad_norm": 0.4520862400531769,
"learning_rate": 0.00019378875361817817,
"loss": 0.0533,
"step": 430
},
{
"epoch": 0.8043875685557587,
"grad_norm": 0.4732885956764221,
"learning_rate": 0.00019336237547799108,
"loss": 0.058,
"step": 440
},
{
"epoch": 0.8226691042047533,
"grad_norm": 0.7703008651733398,
"learning_rate": 0.0001929223455964022,
"loss": 0.0532,
"step": 450
},
{
"epoch": 0.8409506398537477,
"grad_norm": 0.45097994804382324,
"learning_rate": 0.00019246872831597055,
"loss": 0.0465,
"step": 460
},
{
"epoch": 0.8592321755027422,
"grad_norm": 0.5736098289489746,
"learning_rate": 0.00019200158996604753,
"loss": 0.0487,
"step": 470
},
{
"epoch": 0.8775137111517367,
"grad_norm": 0.7237376570701599,
"learning_rate": 0.0001915209988530779,
"loss": 0.0551,
"step": 480
},
{
"epoch": 0.8957952468007313,
"grad_norm": 0.4645770192146301,
"learning_rate": 0.00019102702525061207,
"loss": 0.0495,
"step": 490
},
{
"epoch": 0.9140767824497258,
"grad_norm": 0.5169672966003418,
"learning_rate": 0.00019051974138903027,
"loss": 0.0433,
"step": 500
},
{
"epoch": 0.9323583180987203,
"grad_norm": 0.7457365989685059,
"learning_rate": 0.00018999922144498084,
"loss": 0.0518,
"step": 510
},
{
"epoch": 0.9506398537477148,
"grad_norm": 0.5059699416160583,
"learning_rate": 0.00018946554153053395,
"loss": 0.0474,
"step": 520
},
{
"epoch": 0.9689213893967094,
"grad_norm": 0.8174113035202026,
"learning_rate": 0.00018891877968205213,
"loss": 0.0517,
"step": 530
},
{
"epoch": 0.9872029250457038,
"grad_norm": 0.5508332252502441,
"learning_rate": 0.00018835901584877973,
"loss": 0.0709,
"step": 540
},
{
"epoch": 1.0054844606946984,
"grad_norm": 0.5709052681922913,
"learning_rate": 0.00018778633188115223,
"loss": 0.0484,
"step": 550
},
{
"epoch": 1.023765996343693,
"grad_norm": 0.4354308247566223,
"learning_rate": 0.0001872008115188281,
"loss": 0.0544,
"step": 560
},
{
"epoch": 1.0420475319926874,
"grad_norm": 0.535977303981781,
"learning_rate": 0.00018660254037844388,
"loss": 0.0562,
"step": 570
},
{
"epoch": 1.0603290676416819,
"grad_norm": 0.2939574420452118,
"learning_rate": 0.00018599160594109522,
"loss": 0.0489,
"step": 580
},
{
"epoch": 1.0786106032906764,
"grad_norm": 0.3677907884120941,
"learning_rate": 0.000185368097539545,
"loss": 0.0358,
"step": 590
},
{
"epoch": 1.0968921389396709,
"grad_norm": 0.5382636785507202,
"learning_rate": 0.0001847321063451609,
"loss": 0.0395,
"step": 600
},
{
"epoch": 1.1151736745886653,
"grad_norm": 0.457963764667511,
"learning_rate": 0.00018408372535458397,
"loss": 0.0523,
"step": 610
},
{
"epoch": 1.13345521023766,
"grad_norm": 0.5560534000396729,
"learning_rate": 0.00018342304937613032,
"loss": 0.0531,
"step": 620
},
{
"epoch": 1.1517367458866545,
"grad_norm": 0.6328279376029968,
"learning_rate": 0.00018275017501592818,
"loss": 0.0452,
"step": 630
},
{
"epoch": 1.170018281535649,
"grad_norm": 0.45685553550720215,
"learning_rate": 0.0001820652006637915,
"loss": 0.0402,
"step": 640
},
{
"epoch": 1.1882998171846435,
"grad_norm": 0.21566231548786163,
"learning_rate": 0.0001813682264788334,
"loss": 0.0401,
"step": 650
},
{
"epoch": 1.206581352833638,
"grad_norm": 0.36770665645599365,
"learning_rate": 0.00018065935437482037,
"loss": 0.04,
"step": 660
},
{
"epoch": 1.2248628884826325,
"grad_norm": 0.4096185863018036,
"learning_rate": 0.0001799386880052703,
"loss": 0.0352,
"step": 670
},
{
"epoch": 1.2431444241316272,
"grad_norm": 0.4246453642845154,
"learning_rate": 0.00017920633274829575,
"loss": 0.045,
"step": 680
},
{
"epoch": 1.2614259597806217,
"grad_norm": 0.4160013496875763,
"learning_rate": 0.00017846239569119528,
"loss": 0.0357,
"step": 690
},
{
"epoch": 1.2797074954296161,
"grad_norm": 0.5409733653068542,
"learning_rate": 0.00017770698561479496,
"loss": 0.0376,
"step": 700
},
{
"epoch": 1.2979890310786106,
"grad_norm": 0.22224466502666473,
"learning_rate": 0.00017694021297754188,
"loss": 0.041,
"step": 710
},
{
"epoch": 1.3162705667276051,
"grad_norm": 0.5606803894042969,
"learning_rate": 0.00017616218989935272,
"loss": 0.0367,
"step": 720
},
{
"epoch": 1.3345521023765996,
"grad_norm": 0.3131175935268402,
"learning_rate": 0.00017537303014521918,
"loss": 0.0466,
"step": 730
},
{
"epoch": 1.352833638025594,
"grad_norm": 0.37444230914115906,
"learning_rate": 0.0001745728491085728,
"loss": 0.0401,
"step": 740
},
{
"epoch": 1.3711151736745886,
"grad_norm": 0.6337727308273315,
"learning_rate": 0.0001737617637944119,
"loss": 0.0505,
"step": 750
},
{
"epoch": 1.389396709323583,
"grad_norm": 0.5669440627098083,
"learning_rate": 0.00017293989280219274,
"loss": 0.0372,
"step": 760
},
{
"epoch": 1.4076782449725778,
"grad_norm": 0.388346791267395,
"learning_rate": 0.00017210735630848745,
"loss": 0.035,
"step": 770
},
{
"epoch": 1.4259597806215722,
"grad_norm": 0.5280373096466064,
"learning_rate": 0.00017126427604941148,
"loss": 0.0466,
"step": 780
},
{
"epoch": 1.4442413162705667,
"grad_norm": 0.565298855304718,
"learning_rate": 0.00017041077530282294,
"loss": 0.0365,
"step": 790
},
{
"epoch": 1.4625228519195612,
"grad_norm": 0.35680803656578064,
"learning_rate": 0.00016954697887029655,
"loss": 0.0383,
"step": 800
},
{
"epoch": 1.4808043875685557,
"grad_norm": 0.42788997292518616,
"learning_rate": 0.00016867301305887474,
"loss": 0.0337,
"step": 810
},
{
"epoch": 1.4990859232175504,
"grad_norm": 0.43233945965766907,
"learning_rate": 0.00016778900566259865,
"loss": 0.0505,
"step": 820
},
{
"epoch": 1.517367458866545,
"grad_norm": 0.4589940905570984,
"learning_rate": 0.0001668950859438216,
"loss": 0.0438,
"step": 830
},
{
"epoch": 1.5356489945155394,
"grad_norm": 0.48594310879707336,
"learning_rate": 0.00016599138461430814,
"loss": 0.0323,
"step": 840
},
{
"epoch": 1.5539305301645339,
"grad_norm": 0.31333279609680176,
"learning_rate": 0.00016507803381612076,
"loss": 0.0393,
"step": 850
},
{
"epoch": 1.5722120658135283,
"grad_norm": 0.49847719073295593,
"learning_rate": 0.00016415516710229766,
"loss": 0.0453,
"step": 860
},
{
"epoch": 1.5904936014625228,
"grad_norm": 0.4276566505432129,
"learning_rate": 0.00016322291941732442,
"loss": 0.0362,
"step": 870
},
{
"epoch": 1.6087751371115173,
"grad_norm": 0.47734275460243225,
"learning_rate": 0.0001622814270774018,
"loss": 0.0349,
"step": 880
},
{
"epoch": 1.6270566727605118,
"grad_norm": 0.24307364225387573,
"learning_rate": 0.00016133082775051313,
"loss": 0.0365,
"step": 890
},
{
"epoch": 1.6453382084095063,
"grad_norm": 0.4327755272388458,
"learning_rate": 0.00016037126043629422,
"loss": 0.0318,
"step": 900
},
{
"epoch": 1.6636197440585008,
"grad_norm": 0.2253831923007965,
"learning_rate": 0.0001594028654457083,
"loss": 0.0324,
"step": 910
},
{
"epoch": 1.6819012797074955,
"grad_norm": 0.42007511854171753,
"learning_rate": 0.0001584257843805293,
"loss": 0.0387,
"step": 920
},
{
"epoch": 1.70018281535649,
"grad_norm": 0.5654010772705078,
"learning_rate": 0.00015744016011263638,
"loss": 0.0461,
"step": 930
},
{
"epoch": 1.7184643510054844,
"grad_norm": 0.5979740619659424,
"learning_rate": 0.00015644613676312288,
"loss": 0.0288,
"step": 940
},
{
"epoch": 1.736745886654479,
"grad_norm": 0.6250779628753662,
"learning_rate": 0.00015544385968122227,
"loss": 0.0339,
"step": 950
},
{
"epoch": 1.7550274223034736,
"grad_norm": 0.4420310854911804,
"learning_rate": 0.00015443347542305484,
"loss": 0.0446,
"step": 960
},
{
"epoch": 1.7733089579524681,
"grad_norm": 0.4242953956127167,
"learning_rate": 0.0001534151317301979,
"loss": 0.0402,
"step": 970
},
{
"epoch": 1.7915904936014626,
"grad_norm": 0.2853521406650543,
"learning_rate": 0.00015238897750808242,
"loss": 0.0367,
"step": 980
},
{
"epoch": 1.809872029250457,
"grad_norm": 0.5415486693382263,
"learning_rate": 0.00015135516280421945,
"loss": 0.0312,
"step": 990
},
{
"epoch": 1.8281535648994516,
"grad_norm": 0.3944428265094757,
"learning_rate": 0.00015031383878626016,
"loss": 0.0293,
"step": 1000
},
{
"epoch": 1.846435100548446,
"grad_norm": 0.42964455485343933,
"learning_rate": 0.00014926515771989104,
"loss": 0.0462,
"step": 1010
},
{
"epoch": 1.8647166361974405,
"grad_norm": 0.3574308454990387,
"learning_rate": 0.00014820927294656973,
"loss": 0.0358,
"step": 1020
},
{
"epoch": 1.882998171846435,
"grad_norm": 0.38193315267562866,
"learning_rate": 0.00014714633886110242,
"loss": 0.0393,
"step": 1030
},
{
"epoch": 1.9012797074954295,
"grad_norm": 0.4956030249595642,
"learning_rate": 0.00014607651088906809,
"loss": 0.0312,
"step": 1040
},
{
"epoch": 1.919561243144424,
"grad_norm": 0.4244064688682556,
"learning_rate": 0.00014499994546409152,
"loss": 0.031,
"step": 1050
},
{
"epoch": 1.9378427787934185,
"grad_norm": 0.46385011076927185,
"learning_rate": 0.00014391680000496932,
"loss": 0.0424,
"step": 1060
},
{
"epoch": 1.9561243144424132,
"grad_norm": 0.5440361499786377,
"learning_rate": 0.0001428272328926512,
"loss": 0.0328,
"step": 1070
},
{
"epoch": 1.9744058500914077,
"grad_norm": 0.3221015930175781,
"learning_rate": 0.00014173140344708152,
"loss": 0.0424,
"step": 1080
},
{
"epoch": 1.9926873857404022,
"grad_norm": 0.520367443561554,
"learning_rate": 0.00014062947190390262,
"loss": 0.0396,
"step": 1090
},
{
"epoch": 2.010968921389397,
"grad_norm": 0.29480573534965515,
"learning_rate": 0.0001395215993910249,
"loss": 0.0351,
"step": 1100
},
{
"epoch": 2.0292504570383914,
"grad_norm": 0.35179761052131653,
"learning_rate": 0.00013840794790506616,
"loss": 0.0271,
"step": 1110
},
{
"epoch": 2.047531992687386,
"grad_norm": 0.377270370721817,
"learning_rate": 0.00013728868028766377,
"loss": 0.0311,
"step": 1120
},
{
"epoch": 2.0658135283363803,
"grad_norm": 0.4772701859474182,
"learning_rate": 0.0001361639602016637,
"loss": 0.0372,
"step": 1130
},
{
"epoch": 2.084095063985375,
"grad_norm": 0.30298906564712524,
"learning_rate": 0.000135033952107189,
"loss": 0.0255,
"step": 1140
},
{
"epoch": 2.1023765996343693,
"grad_norm": 0.39370113611221313,
"learning_rate": 0.00013389882123759206,
"loss": 0.0327,
"step": 1150
},
{
"epoch": 2.1206581352833638,
"grad_norm": 0.2912181317806244,
"learning_rate": 0.00013275873357529368,
"loss": 0.0268,
"step": 1160
},
{
"epoch": 2.1389396709323583,
"grad_norm": 0.29357820749282837,
"learning_rate": 0.00013161385582751247,
"loss": 0.0273,
"step": 1170
},
{
"epoch": 2.1572212065813527,
"grad_norm": 0.3242945075035095,
"learning_rate": 0.00013046435540188848,
"loss": 0.0296,
"step": 1180
},
{
"epoch": 2.1755027422303472,
"grad_norm": 1.168150544166565,
"learning_rate": 0.00012931040038200435,
"loss": 0.0416,
"step": 1190
},
{
"epoch": 2.1937842778793417,
"grad_norm": 0.3501128852367401,
"learning_rate": 0.00012815215950280753,
"loss": 0.0379,
"step": 1200
},
{
"epoch": 2.212065813528336,
"grad_norm": 0.46127256751060486,
"learning_rate": 0.0001269898021259373,
"loss": 0.0372,
"step": 1210
},
{
"epoch": 2.2303473491773307,
"grad_norm": 0.4480052888393402,
"learning_rate": 0.0001258234982149604,
"loss": 0.0366,
"step": 1220
},
{
"epoch": 2.2486288848263256,
"grad_norm": 0.38535383343696594,
"learning_rate": 0.0001246534183105181,
"loss": 0.0289,
"step": 1230
},
{
"epoch": 2.26691042047532,
"grad_norm": 0.39918404817581177,
"learning_rate": 0.00012347973350538936,
"loss": 0.029,
"step": 1240
},
{
"epoch": 2.2851919561243146,
"grad_norm": 0.27928563952445984,
"learning_rate": 0.00012230261541947316,
"loss": 0.0262,
"step": 1250
},
{
"epoch": 2.303473491773309,
"grad_norm": 0.43867453932762146,
"learning_rate": 0.00012112223617469372,
"loss": 0.0227,
"step": 1260
},
{
"epoch": 2.3217550274223036,
"grad_norm": 0.3848976194858551,
"learning_rate": 0.00011993876836983198,
"loss": 0.0251,
"step": 1270
},
{
"epoch": 2.340036563071298,
"grad_norm": 0.3365519046783447,
"learning_rate": 0.0001187523850552881,
"loss": 0.0345,
"step": 1280
},
{
"epoch": 2.3583180987202925,
"grad_norm": 0.3406737446784973,
"learning_rate": 0.00011756325970777717,
"loss": 0.0273,
"step": 1290
},
{
"epoch": 2.376599634369287,
"grad_norm": 0.28142690658569336,
"learning_rate": 0.00011637156620496308,
"loss": 0.0275,
"step": 1300
},
{
"epoch": 2.3948811700182815,
"grad_norm": 0.36976391077041626,
"learning_rate": 0.00011517747880003335,
"loss": 0.0243,
"step": 1310
},
{
"epoch": 2.413162705667276,
"grad_norm": 0.22825664281845093,
"learning_rate": 0.00011398117209621966,
"loss": 0.0278,
"step": 1320
},
{
"epoch": 2.4314442413162705,
"grad_norm": 0.3394540548324585,
"learning_rate": 0.00011278282102126633,
"loss": 0.0357,
"step": 1330
},
{
"epoch": 2.449725776965265,
"grad_norm": 0.26682886481285095,
"learning_rate": 0.00011158260080185226,
"loss": 0.0407,
"step": 1340
},
{
"epoch": 2.4680073126142594,
"grad_norm": 0.23459388315677643,
"learning_rate": 0.00011038068693796846,
"loss": 0.0263,
"step": 1350
},
{
"epoch": 2.4862888482632544,
"grad_norm": 0.32797005772590637,
"learning_rate": 0.00010917725517725608,
"loss": 0.0354,
"step": 1360
},
{
"epoch": 2.504570383912249,
"grad_norm": 0.2672847509384155,
"learning_rate": 0.00010797248148930783,
"loss": 0.0203,
"step": 1370
},
{
"epoch": 2.5228519195612433,
"grad_norm": 0.34542036056518555,
"learning_rate": 0.00010676654203993732,
"loss": 0.0246,
"step": 1380
},
{
"epoch": 2.541133455210238,
"grad_norm": 0.5064176321029663,
"learning_rate": 0.00010555961316541946,
"loss": 0.0276,
"step": 1390
},
{
"epoch": 2.5594149908592323,
"grad_norm": 0.34542617201805115,
"learning_rate": 0.00010435187134670607,
"loss": 0.0238,
"step": 1400
},
{
"epoch": 2.577696526508227,
"grad_norm": 0.3336438238620758,
"learning_rate": 0.00010314349318362015,
"loss": 0.0353,
"step": 1410
},
{
"epoch": 2.5959780621572213,
"grad_norm": 0.22887752950191498,
"learning_rate": 0.00010193465536903307,
"loss": 0.028,
"step": 1420
},
{
"epoch": 2.6142595978062158,
"grad_norm": 0.1399448662996292,
"learning_rate": 0.00010072553466302784,
"loss": 0.028,
"step": 1430
},
{
"epoch": 2.6325411334552102,
"grad_norm": 0.36335644125938416,
"learning_rate": 9.951630786705279e-05,
"loss": 0.0196,
"step": 1440
},
{
"epoch": 2.6508226691042047,
"grad_norm": 0.22947153449058533,
"learning_rate": 9.830715179806905e-05,
"loss": 0.0275,
"step": 1450
},
{
"epoch": 2.669104204753199,
"grad_norm": 0.21563003957271576,
"learning_rate": 9.709824326269576e-05,
"loss": 0.0216,
"step": 1460
},
{
"epoch": 2.6873857404021937,
"grad_norm": 0.3260309100151062,
"learning_rate": 9.5889759031357e-05,
"loss": 0.018,
"step": 1470
},
{
"epoch": 2.705667276051188,
"grad_norm": 0.15418443083763123,
"learning_rate": 9.468187581243378e-05,
"loss": 0.0244,
"step": 1480
},
{
"epoch": 2.7239488117001827,
"grad_norm": 0.2873231768608093,
"learning_rate": 9.347477022642503e-05,
"loss": 0.0186,
"step": 1490
},
{
"epoch": 2.742230347349177,
"grad_norm": 0.2715139091014862,
"learning_rate": 9.226861878012197e-05,
"loss": 0.0273,
"step": 1500
},
{
"epoch": 2.7605118829981716,
"grad_norm": 0.17074620723724365,
"learning_rate": 9.106359784079832e-05,
"loss": 0.0174,
"step": 1510
},
{
"epoch": 2.778793418647166,
"grad_norm": 0.2897492051124573,
"learning_rate": 8.985988361042153e-05,
"loss": 0.0283,
"step": 1520
},
{
"epoch": 2.797074954296161,
"grad_norm": 0.5155644416809082,
"learning_rate": 8.8657652099888e-05,
"loss": 0.0216,
"step": 1530
},
{
"epoch": 2.8153564899451555,
"grad_norm": 0.33276352286338806,
"learning_rate": 8.745707910328615e-05,
"loss": 0.0245,
"step": 1540
},
{
"epoch": 2.83363802559415,
"grad_norm": 0.4756206274032593,
"learning_rate": 8.625834017219113e-05,
"loss": 0.0303,
"step": 1550
},
{
"epoch": 2.8519195612431445,
"grad_norm": 0.2755451202392578,
"learning_rate": 8.506161058999541e-05,
"loss": 0.0199,
"step": 1560
},
{
"epoch": 2.870201096892139,
"grad_norm": 0.26369351148605347,
"learning_rate": 8.386706534627805e-05,
"loss": 0.0204,
"step": 1570
},
{
"epoch": 2.8884826325411335,
"grad_norm": 0.2358650118112564,
"learning_rate": 8.267487911121715e-05,
"loss": 0.0211,
"step": 1580
},
{
"epoch": 2.906764168190128,
"grad_norm": 0.22182169556617737,
"learning_rate": 8.148522621004926e-05,
"loss": 0.0233,
"step": 1590
},
{
"epoch": 2.9250457038391224,
"grad_norm": 0.30960527062416077,
"learning_rate": 8.029828059757875e-05,
"loss": 0.0243,
"step": 1600
},
{
"epoch": 2.943327239488117,
"grad_norm": 0.38207757472991943,
"learning_rate": 7.91142158327417e-05,
"loss": 0.0295,
"step": 1610
},
{
"epoch": 2.9616087751371114,
"grad_norm": 0.24521781504154205,
"learning_rate": 7.793320505322761e-05,
"loss": 0.0206,
"step": 1620
},
{
"epoch": 2.979890310786106,
"grad_norm": 0.3253994286060333,
"learning_rate": 7.675542095016256e-05,
"loss": 0.026,
"step": 1630
},
{
"epoch": 2.998171846435101,
"grad_norm": 0.3253840208053589,
"learning_rate": 7.558103574285779e-05,
"loss": 0.0219,
"step": 1640
},
{
"epoch": 3.016453382084095,
"grad_norm": 0.2342890352010727,
"learning_rate": 7.441022115362729e-05,
"loss": 0.0181,
"step": 1650
},
{
"epoch": 3.03473491773309,
"grad_norm": 0.2249564677476883,
"learning_rate": 7.324314838267796e-05,
"loss": 0.0228,
"step": 1660
},
{
"epoch": 3.0530164533820843,
"grad_norm": 0.24722999334335327,
"learning_rate": 7.207998808307628e-05,
"loss": 0.018,
"step": 1670
},
{
"epoch": 3.0712979890310788,
"grad_norm": 0.22779327630996704,
"learning_rate": 7.092091033579475e-05,
"loss": 0.0193,
"step": 1680
},
{
"epoch": 3.0895795246800732,
"grad_norm": 0.34452179074287415,
"learning_rate": 6.976608462484226e-05,
"loss": 0.0327,
"step": 1690
},
{
"epoch": 3.1078610603290677,
"grad_norm": 0.30508124828338623,
"learning_rate": 6.861567981248142e-05,
"loss": 0.0261,
"step": 1700
},
{
"epoch": 3.126142595978062,
"grad_norm": 0.319670706987381,
"learning_rate": 6.746986411453717e-05,
"loss": 0.0189,
"step": 1710
},
{
"epoch": 3.1444241316270567,
"grad_norm": 0.35580283403396606,
"learning_rate": 6.632880507579957e-05,
"loss": 0.0242,
"step": 1720
},
{
"epoch": 3.162705667276051,
"grad_norm": 0.3020285964012146,
"learning_rate": 6.519266954552502e-05,
"loss": 0.0176,
"step": 1730
},
{
"epoch": 3.1809872029250457,
"grad_norm": 0.27105554938316345,
"learning_rate": 6.406162365303882e-05,
"loss": 0.0268,
"step": 1740
},
{
"epoch": 3.19926873857404,
"grad_norm": 0.20928241312503815,
"learning_rate": 6.293583278344361e-05,
"loss": 0.0206,
"step": 1750
},
{
"epoch": 3.2175502742230346,
"grad_norm": 0.2314785271883011,
"learning_rate": 6.181546155343579e-05,
"loss": 0.0198,
"step": 1760
},
{
"epoch": 3.235831809872029,
"grad_norm": 0.2732461988925934,
"learning_rate": 6.070067378723501e-05,
"loss": 0.0177,
"step": 1770
},
{
"epoch": 3.2541133455210236,
"grad_norm": 0.17697979509830475,
"learning_rate": 5.959163249262913e-05,
"loss": 0.0155,
"step": 1780
},
{
"epoch": 3.272394881170018,
"grad_norm": 0.24429567158222198,
"learning_rate": 5.848849983713894e-05,
"loss": 0.0212,
"step": 1790
},
{
"epoch": 3.2906764168190126,
"grad_norm": 0.36660096049308777,
"learning_rate": 5.739143712430521e-05,
"loss": 0.0281,
"step": 1800
},
{
"epoch": 3.3089579524680075,
"grad_norm": 0.2895634174346924,
"learning_rate": 5.630060477010253e-05,
"loss": 0.018,
"step": 1810
},
{
"epoch": 3.327239488117002,
"grad_norm": 0.3412606418132782,
"learning_rate": 5.5216162279482964e-05,
"loss": 0.0134,
"step": 1820
},
{
"epoch": 3.3455210237659965,
"grad_norm": 0.22716091573238373,
"learning_rate": 5.4138268223052326e-05,
"loss": 0.016,
"step": 1830
},
{
"epoch": 3.363802559414991,
"grad_norm": 0.24945920705795288,
"learning_rate": 5.306708021388378e-05,
"loss": 0.0208,
"step": 1840
},
{
"epoch": 3.3820840950639854,
"grad_norm": 0.24487105011940002,
"learning_rate": 5.200275488447104e-05,
"loss": 0.018,
"step": 1850
},
{
"epoch": 3.40036563071298,
"grad_norm": 0.24816852807998657,
"learning_rate": 5.094544786382522e-05,
"loss": 0.0159,
"step": 1860
},
{
"epoch": 3.4186471663619744,
"grad_norm": 0.1848219782114029,
"learning_rate": 4.989531375471805e-05,
"loss": 0.0142,
"step": 1870
},
{
"epoch": 3.436928702010969,
"grad_norm": 0.19923894107341766,
"learning_rate": 4.885250611107558e-05,
"loss": 0.0214,
"step": 1880
},
{
"epoch": 3.4552102376599634,
"grad_norm": 0.1752861738204956,
"learning_rate": 4.7817177415524796e-05,
"loss": 0.0198,
"step": 1890
},
{
"epoch": 3.473491773308958,
"grad_norm": 0.3053307831287384,
"learning_rate": 4.678947905709744e-05,
"loss": 0.0225,
"step": 1900
},
{
"epoch": 3.4917733089579523,
"grad_norm": 0.19800381362438202,
"learning_rate": 4.576956130909317e-05,
"loss": 0.016,
"step": 1910
},
{
"epoch": 3.510054844606947,
"grad_norm": 0.1873503029346466,
"learning_rate": 4.475757330710621e-05,
"loss": 0.0144,
"step": 1920
},
{
"epoch": 3.5283363802559418,
"grad_norm": 0.23367895185947418,
"learning_rate": 4.375366302721825e-05,
"loss": 0.0161,
"step": 1930
},
{
"epoch": 3.5466179159049362,
"grad_norm": 0.17103944718837738,
"learning_rate": 4.2757977264361046e-05,
"loss": 0.0146,
"step": 1940
},
{
"epoch": 3.5648994515539307,
"grad_norm": 0.2473006546497345,
"learning_rate": 4.177066161085148e-05,
"loss": 0.0184,
"step": 1950
},
{
"epoch": 3.583180987202925,
"grad_norm": 0.31398236751556396,
"learning_rate": 4.0791860435102524e-05,
"loss": 0.0146,
"step": 1960
},
{
"epoch": 3.6014625228519197,
"grad_norm": 0.33835136890411377,
"learning_rate": 3.982171686051334e-05,
"loss": 0.021,
"step": 1970
},
{
"epoch": 3.619744058500914,
"grad_norm": 0.16258537769317627,
"learning_rate": 3.8860372744541407e-05,
"loss": 0.0196,
"step": 1980
},
{
"epoch": 3.6380255941499087,
"grad_norm": 0.3083174228668213,
"learning_rate": 3.790796865795947e-05,
"loss": 0.0152,
"step": 1990
},
{
"epoch": 3.656307129798903,
"grad_norm": 0.21282333135604858,
"learning_rate": 3.696464386430093e-05,
"loss": 0.0215,
"step": 2000
},
{
"epoch": 3.6745886654478976,
"grad_norm": 0.20800185203552246,
"learning_rate": 3.6030536299496395e-05,
"loss": 0.0155,
"step": 2010
},
{
"epoch": 3.692870201096892,
"grad_norm": 0.251663476228714,
"learning_rate": 3.5105782551704145e-05,
"loss": 0.0222,
"step": 2020
},
{
"epoch": 3.7111517367458866,
"grad_norm": 0.24097998440265656,
"learning_rate": 3.419051784133773e-05,
"loss": 0.0142,
"step": 2030
},
{
"epoch": 3.729433272394881,
"grad_norm": 0.18417520821094513,
"learning_rate": 3.328487600129371e-05,
"loss": 0.0147,
"step": 2040
},
{
"epoch": 3.7477148080438756,
"grad_norm": 0.18106205761432648,
"learning_rate": 3.2388989457382126e-05,
"loss": 0.0125,
"step": 2050
},
{
"epoch": 3.76599634369287,
"grad_norm": 0.14622414112091064,
"learning_rate": 3.1502989208962855e-05,
"loss": 0.0151,
"step": 2060
},
{
"epoch": 3.7842778793418645,
"grad_norm": 0.29628556966781616,
"learning_rate": 3.062700480979046e-05,
"loss": 0.0206,
"step": 2070
},
{
"epoch": 3.802559414990859,
"grad_norm": 0.26881730556488037,
"learning_rate": 2.9761164349070315e-05,
"loss": 0.0176,
"step": 2080
},
{
"epoch": 3.8208409506398535,
"grad_norm": 0.4180646240711212,
"learning_rate": 2.8905594432729055e-05,
"loss": 0.0179,
"step": 2090
},
{
"epoch": 3.839122486288848,
"grad_norm": 0.25500163435935974,
"learning_rate": 2.8060420164902012e-05,
"loss": 0.0142,
"step": 2100
},
{
"epoch": 3.857404021937843,
"grad_norm": 0.21968974173069,
"learning_rate": 2.7225765129639836e-05,
"loss": 0.0161,
"step": 2110
},
{
"epoch": 3.8756855575868374,
"grad_norm": 0.24668078124523163,
"learning_rate": 2.6401751372837813e-05,
"loss": 0.0217,
"step": 2120
},
{
"epoch": 3.893967093235832,
"grad_norm": 0.2258848249912262,
"learning_rate": 2.5588499384389865e-05,
"loss": 0.0178,
"step": 2130
},
{
"epoch": 3.9122486288848264,
"grad_norm": 0.1784961074590683,
"learning_rate": 2.478612808057018e-05,
"loss": 0.0114,
"step": 2140
},
{
"epoch": 3.930530164533821,
"grad_norm": 0.28832298517227173,
"learning_rate": 2.3994754786644923e-05,
"loss": 0.0109,
"step": 2150
},
{
"epoch": 3.9488117001828154,
"grad_norm": 0.12029292434453964,
"learning_rate": 2.3214495219716436e-05,
"loss": 0.0211,
"step": 2160
},
{
"epoch": 3.96709323583181,
"grad_norm": 0.19231897592544556,
"learning_rate": 2.2445463471802785e-05,
"loss": 0.0098,
"step": 2170
},
{
"epoch": 3.9853747714808043,
"grad_norm": 0.08982887864112854,
"learning_rate": 2.1687771993155004e-05,
"loss": 0.0077,
"step": 2180
},
{
"epoch": 4.003656307129799,
"grad_norm": 0.21466206014156342,
"learning_rate": 2.0941531575813988e-05,
"loss": 0.0159,
"step": 2190
},
{
"epoch": 4.021937842778794,
"grad_norm": 0.17917244136333466,
"learning_rate": 2.0206851337410415e-05,
"loss": 0.0139,
"step": 2200
},
{
"epoch": 4.040219378427788,
"grad_norm": 0.08883915841579437,
"learning_rate": 1.9483838705209012e-05,
"loss": 0.0152,
"step": 2210
},
{
"epoch": 4.058500914076783,
"grad_norm": 0.16674405336380005,
"learning_rate": 1.8772599400400258e-05,
"loss": 0.0196,
"step": 2220
},
{
"epoch": 4.076782449725777,
"grad_norm": 0.10342701524496078,
"learning_rate": 1.807323742264162e-05,
"loss": 0.0161,
"step": 2230
},
{
"epoch": 4.095063985374772,
"grad_norm": 0.1896440088748932,
"learning_rate": 1.7385855034850184e-05,
"loss": 0.0122,
"step": 2240
},
{
"epoch": 4.113345521023766,
"grad_norm": 0.16498374938964844,
"learning_rate": 1.6710552748249598e-05,
"loss": 0.0133,
"step": 2250
},
{
"epoch": 4.131627056672761,
"grad_norm": 0.17953291535377502,
"learning_rate": 1.604742930767298e-05,
"loss": 0.0219,
"step": 2260
},
{
"epoch": 4.149908592321755,
"grad_norm": 0.18694134056568146,
"learning_rate": 1.5396581677124124e-05,
"loss": 0.0169,
"step": 2270
},
{
"epoch": 4.16819012797075,
"grad_norm": 0.17348328232765198,
"learning_rate": 1.4758105025599068e-05,
"loss": 0.0159,
"step": 2280
},
{
"epoch": 4.186471663619744,
"grad_norm": 0.16517849266529083,
"learning_rate": 1.4132092713170242e-05,
"loss": 0.0137,
"step": 2290
},
{
"epoch": 4.204753199268739,
"grad_norm": 0.13645470142364502,
"learning_rate": 1.3518636277335084e-05,
"loss": 0.0149,
"step": 2300
},
{
"epoch": 4.223034734917733,
"grad_norm": 0.14027458429336548,
"learning_rate": 1.291782541963107e-05,
"loss": 0.0147,
"step": 2310
},
{
"epoch": 4.2413162705667276,
"grad_norm": 0.11632464081048965,
"learning_rate": 1.2329747992519269e-05,
"loss": 0.0137,
"step": 2320
},
{
"epoch": 4.259597806215722,
"grad_norm": 0.2426212579011917,
"learning_rate": 1.1754489986538419e-05,
"loss": 0.0117,
"step": 2330
},
{
"epoch": 4.2778793418647165,
"grad_norm": 0.20155277848243713,
"learning_rate": 1.1192135517730884e-05,
"loss": 0.0147,
"step": 2340
},
{
"epoch": 4.296160877513711,
"grad_norm": 0.14590322971343994,
"learning_rate": 1.0642766815343196e-05,
"loss": 0.0119,
"step": 2350
},
{
"epoch": 4.3144424131627055,
"grad_norm": 0.17194287478923798,
"learning_rate": 1.0106464209802013e-05,
"loss": 0.0115,
"step": 2360
},
{
"epoch": 4.3327239488117,
"grad_norm": 0.243038609623909,
"learning_rate": 9.583306120968072e-06,
"loss": 0.0153,
"step": 2370
},
{
"epoch": 4.3510054844606945,
"grad_norm": 0.29729005694389343,
"learning_rate": 9.0733690466694e-06,
"loss": 0.0136,
"step": 2380
},
{
"epoch": 4.369287020109689,
"grad_norm": 0.1595577597618103,
"learning_rate": 8.576727551515474e-06,
"loss": 0.0156,
"step": 2390
},
{
"epoch": 4.387568555758683,
"grad_norm": 0.12783007323741913,
"learning_rate": 8.093454255994248e-06,
"loss": 0.0122,
"step": 2400
},
{
"epoch": 4.405850091407678,
"grad_norm": 0.26608356833457947,
"learning_rate": 7.6236198258532675e-06,
"loss": 0.0136,
"step": 2410
},
{
"epoch": 4.424131627056672,
"grad_norm": 0.1889527142047882,
"learning_rate": 7.167292961766725e-06,
"loss": 0.015,
"step": 2420
},
{
"epoch": 4.442413162705667,
"grad_norm": 0.2580418884754181,
"learning_rate": 6.724540389289913e-06,
"loss": 0.0132,
"step": 2430
},
{
"epoch": 4.460694698354661,
"grad_norm": 0.22082190215587616,
"learning_rate": 6.295426849102271e-06,
"loss": 0.0113,
"step": 2440
},
{
"epoch": 4.478976234003657,
"grad_norm": 0.11176195740699768,
"learning_rate": 5.8800150875408574e-06,
"loss": 0.0141,
"step": 2450
},
{
"epoch": 4.497257769652651,
"grad_norm": 0.1779015064239502,
"learning_rate": 5.478365847425449e-06,
"loss": 0.0113,
"step": 2460
},
{
"epoch": 4.515539305301646,
"grad_norm": 0.15661382675170898,
"learning_rate": 5.090537859176425e-06,
"loss": 0.0102,
"step": 2470
},
{
"epoch": 4.53382084095064,
"grad_norm": 0.21932142972946167,
"learning_rate": 4.716587832227071e-06,
"loss": 0.0147,
"step": 2480
},
{
"epoch": 4.552102376599635,
"grad_norm": 0.30200353264808655,
"learning_rate": 4.356570446731356e-06,
"loss": 0.0152,
"step": 2490
},
{
"epoch": 4.570383912248629,
"grad_norm": 0.11431296914815903,
"learning_rate": 4.010538345568371e-06,
"loss": 0.017,
"step": 2500
},
{
"epoch": 4.588665447897624,
"grad_norm": 0.2187824845314026,
"learning_rate": 3.678542126644813e-06,
"loss": 0.0168,
"step": 2510
},
{
"epoch": 4.606946983546618,
"grad_norm": 0.12425347417593002,
"learning_rate": 3.360630335496362e-06,
"loss": 0.0113,
"step": 2520
},
{
"epoch": 4.625228519195613,
"grad_norm": 0.17450736463069916,
"learning_rate": 3.056849458189115e-06,
"loss": 0.015,
"step": 2530
},
{
"epoch": 4.643510054844607,
"grad_norm": 0.2220509946346283,
"learning_rate": 2.7672439145223773e-06,
"loss": 0.0196,
"step": 2540
},
{
"epoch": 4.661791590493602,
"grad_norm": 0.2917903959751129,
"learning_rate": 2.491856051533392e-06,
"loss": 0.0165,
"step": 2550
},
{
"epoch": 4.680073126142596,
"grad_norm": 0.22880949079990387,
"learning_rate": 2.230726137305206e-06,
"loss": 0.0165,
"step": 2560
},
{
"epoch": 4.698354661791591,
"grad_norm": 0.2307160645723343,
"learning_rate": 1.983892355078587e-06,
"loss": 0.0129,
"step": 2570
},
{
"epoch": 4.716636197440585,
"grad_norm": 0.1975175142288208,
"learning_rate": 1.7513907976687283e-06,
"loss": 0.016,
"step": 2580
},
{
"epoch": 4.7349177330895795,
"grad_norm": 0.23436793684959412,
"learning_rate": 1.533255462187666e-06,
"loss": 0.0108,
"step": 2590
},
{
"epoch": 4.753199268738574,
"grad_norm": 0.14805355668067932,
"learning_rate": 1.329518245073047e-06,
"loss": 0.0182,
"step": 2600
},
{
"epoch": 4.7714808043875685,
"grad_norm": 0.1988326609134674,
"learning_rate": 1.1402089374242365e-06,
"loss": 0.0119,
"step": 2610
},
{
"epoch": 4.789762340036563,
"grad_norm": 0.12207505851984024,
"learning_rate": 9.65355220646036e-07,
"loss": 0.0128,
"step": 2620
},
{
"epoch": 4.8080438756855575,
"grad_norm": 0.1775001883506775,
"learning_rate": 8.049826624011881e-07,
"loss": 0.0166,
"step": 2630
},
{
"epoch": 4.826325411334552,
"grad_norm": 0.2577812075614929,
"learning_rate": 6.591147128716224e-07,
"loss": 0.0191,
"step": 2640
},
{
"epoch": 4.844606946983546,
"grad_norm": 0.1870380938053131,
"learning_rate": 5.277727013296097e-07,
"loss": 0.0125,
"step": 2650
},
{
"epoch": 4.862888482632541,
"grad_norm": 0.22090613842010498,
"learning_rate": 4.1097583301888954e-07,
"loss": 0.009,
"step": 2660
},
{
"epoch": 4.881170018281535,
"grad_norm": 0.25381821393966675,
"learning_rate": 3.0874118634640626e-07,
"loss": 0.0158,
"step": 2670
},
{
"epoch": 4.89945155393053,
"grad_norm": 0.25577688217163086,
"learning_rate": 2.210837103850949e-07,
"loss": 0.0074,
"step": 2680
},
{
"epoch": 4.917733089579524,
"grad_norm": 0.1378592997789383,
"learning_rate": 1.4801622268791892e-07,
"loss": 0.0104,
"step": 2690
},
{
"epoch": 4.936014625228519,
"grad_norm": 0.1672651469707489,
"learning_rate": 8.954940741369155e-08,
"loss": 0.0126,
"step": 2700
},
{
"epoch": 4.954296160877513,
"grad_norm": 0.10131768137216568,
"learning_rate": 4.5691813764803247e-08,
"loss": 0.0093,
"step": 2710
},
{
"epoch": 4.972577696526509,
"grad_norm": 0.19686748087406158,
"learning_rate": 1.644985473709948e-08,
"loss": 0.0132,
"step": 2720
},
{
"epoch": 4.990859232175502,
"grad_norm": 0.16079658269882202,
"learning_rate": 1.8278061821863646e-09,
"loss": 0.0096,
"step": 2730
},
{
"epoch": 5.0,
"step": 2735,
"total_flos": 9.752547304210464e+16,
"train_loss": 0.045771664261164136,
"train_runtime": 1237.486,
"train_samples_per_second": 35.362,
"train_steps_per_second": 2.21
}
],
"logging_steps": 10,
"max_steps": 2735,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.752547304210464e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}
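
For reference, below is a minimal sketch of how one might inspect this trainer state offline. It assumes the JSON above is saved locally as trainer_state.json; the filename and the plotting choices are illustrative, not part of the uploaded file.

import json

import matplotlib.pyplot as plt

# Load the trainer state produced by transformers.Trainer (the JSON shown above).
with open("trainer_state.json") as f:
    state = json.load(f)

# Keep only the per-step logging entries; the final summary entry has no "loss" key.
logs = [entry for entry in state["log_history"] if "loss" in entry]

steps = [entry["step"] for entry in logs]
losses = [entry["loss"] for entry in logs]
lrs = [entry["learning_rate"] for entry in logs]

# Plot the training loss curve and the learning-rate schedule side by side.
fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, losses)
ax_loss.set(xlabel="step", ylabel="train loss", title="Training loss")
ax_lr.plot(steps, lrs)
ax_lr.set(xlabel="step", ylabel="learning rate", title="Learning-rate schedule")
fig.tight_layout()
plt.show()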