{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998269896193771,
"eval_steps": 500,
"global_step": 2889,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01730103806228374,
"grad_norm": 9.043228149414062,
"learning_rate": 1.3793103448275863e-05,
"loss": 1.0259,
"step": 10
},
{
"epoch": 0.03460207612456748,
"grad_norm": 4.801871299743652,
"learning_rate": 2.7586206896551727e-05,
"loss": 0.6145,
"step": 20
},
{
"epoch": 0.05190311418685121,
"grad_norm": 2.854033946990967,
"learning_rate": 4.1379310344827587e-05,
"loss": 0.3438,
"step": 30
},
{
"epoch": 0.06920415224913495,
"grad_norm": 1.3458539247512817,
"learning_rate": 5.517241379310345e-05,
"loss": 0.242,
"step": 40
},
{
"epoch": 0.08650519031141868,
"grad_norm": 2.376776695251465,
"learning_rate": 6.896551724137931e-05,
"loss": 0.2051,
"step": 50
},
{
"epoch": 0.10380622837370242,
"grad_norm": 1.1847227811813354,
"learning_rate": 8.275862068965517e-05,
"loss": 0.1798,
"step": 60
},
{
"epoch": 0.12110726643598616,
"grad_norm": 1.5719020366668701,
"learning_rate": 9.655172413793105e-05,
"loss": 0.1753,
"step": 70
},
{
"epoch": 0.1384083044982699,
"grad_norm": 1.4353362321853638,
"learning_rate": 0.0001103448275862069,
"loss": 0.168,
"step": 80
},
{
"epoch": 0.15570934256055363,
"grad_norm": 1.1386443376541138,
"learning_rate": 0.00012413793103448277,
"loss": 0.1318,
"step": 90
},
{
"epoch": 0.17301038062283736,
"grad_norm": 0.8579152822494507,
"learning_rate": 0.00013793103448275863,
"loss": 0.1337,
"step": 100
},
{
"epoch": 0.1903114186851211,
"grad_norm": 1.252938151359558,
"learning_rate": 0.00015172413793103449,
"loss": 0.1174,
"step": 110
},
{
"epoch": 0.20761245674740483,
"grad_norm": 0.9858207702636719,
"learning_rate": 0.00016551724137931035,
"loss": 0.141,
"step": 120
},
{
"epoch": 0.22491349480968859,
"grad_norm": 1.5159997940063477,
"learning_rate": 0.0001793103448275862,
"loss": 0.1132,
"step": 130
},
{
"epoch": 0.2422145328719723,
"grad_norm": 1.087937593460083,
"learning_rate": 0.0001931034482758621,
"loss": 0.1008,
"step": 140
},
{
"epoch": 0.25951557093425603,
"grad_norm": 1.02494215965271,
"learning_rate": 0.0001999983615229662,
"loss": 0.124,
"step": 150
},
{
"epoch": 0.2768166089965398,
"grad_norm": 1.0136982202529907,
"learning_rate": 0.00019998525402884653,
"loss": 0.0977,
"step": 160
},
{
"epoch": 0.29411764705882354,
"grad_norm": 1.4797533750534058,
"learning_rate": 0.0001999590407586994,
"loss": 0.1054,
"step": 170
},
{
"epoch": 0.31141868512110726,
"grad_norm": 0.7462301254272461,
"learning_rate": 0.000199919725148484,
"loss": 0.0972,
"step": 180
},
{
"epoch": 0.328719723183391,
"grad_norm": 0.9645005464553833,
"learning_rate": 0.00019986731235157592,
"loss": 0.11,
"step": 190
},
{
"epoch": 0.3460207612456747,
"grad_norm": 1.327414631843567,
"learning_rate": 0.00019980180923809214,
"loss": 0.111,
"step": 200
},
{
"epoch": 0.3633217993079585,
"grad_norm": 0.8332083821296692,
"learning_rate": 0.00019972322439399,
"loss": 0.092,
"step": 210
},
{
"epoch": 0.3806228373702422,
"grad_norm": 0.9236223101615906,
"learning_rate": 0.00019963156811994215,
"loss": 0.0983,
"step": 220
},
{
"epoch": 0.39792387543252594,
"grad_norm": 0.8200174570083618,
"learning_rate": 0.0001995268524299861,
"loss": 0.0888,
"step": 230
},
{
"epoch": 0.41522491349480967,
"grad_norm": 0.8660184741020203,
"learning_rate": 0.00019940909104994973,
"loss": 0.0761,
"step": 240
},
{
"epoch": 0.43252595155709345,
"grad_norm": 0.826708972454071,
"learning_rate": 0.00019927829941565186,
"loss": 0.0794,
"step": 250
},
{
"epoch": 0.44982698961937717,
"grad_norm": 0.5673882961273193,
"learning_rate": 0.00019913449467087916,
"loss": 0.0795,
"step": 260
},
{
"epoch": 0.4671280276816609,
"grad_norm": 0.9092473983764648,
"learning_rate": 0.00019897769566513897,
"loss": 0.0779,
"step": 270
},
{
"epoch": 0.4844290657439446,
"grad_norm": 0.8896488547325134,
"learning_rate": 0.00019880792295118852,
"loss": 0.0982,
"step": 280
},
{
"epoch": 0.5017301038062284,
"grad_norm": 1.0210949182510376,
"learning_rate": 0.00019862519878234084,
"loss": 0.0653,
"step": 290
},
{
"epoch": 0.5190311418685121,
"grad_norm": 0.8563092350959778,
"learning_rate": 0.00019842954710954812,
"loss": 0.0807,
"step": 300
},
{
"epoch": 0.5363321799307958,
"grad_norm": 0.9336950182914734,
"learning_rate": 0.000198220993578262,
"loss": 0.073,
"step": 310
},
{
"epoch": 0.5536332179930796,
"grad_norm": 0.6465988159179688,
"learning_rate": 0.00019799956552507233,
"loss": 0.0828,
"step": 320
},
{
"epoch": 0.5709342560553633,
"grad_norm": 1.0124980211257935,
"learning_rate": 0.00019776529197412362,
"loss": 0.0819,
"step": 330
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.7318516373634338,
"learning_rate": 0.00019751820363331097,
"loss": 0.0667,
"step": 340
},
{
"epoch": 0.6055363321799307,
"grad_norm": 0.8519881367683411,
"learning_rate": 0.00019725833289025476,
"loss": 0.0654,
"step": 350
},
{
"epoch": 0.6228373702422145,
"grad_norm": 0.5366441607475281,
"learning_rate": 0.00019698571380805552,
"loss": 0.0726,
"step": 360
},
{
"epoch": 0.6401384083044983,
"grad_norm": 0.570626974105835,
"learning_rate": 0.00019670038212082886,
"loss": 0.0554,
"step": 370
},
{
"epoch": 0.657439446366782,
"grad_norm": 0.7261837720870972,
"learning_rate": 0.00019640237522902174,
"loss": 0.0674,
"step": 380
},
{
"epoch": 0.6747404844290658,
"grad_norm": 0.49231183528900146,
"learning_rate": 0.00019609173219450998,
"loss": 0.0734,
"step": 390
},
{
"epoch": 0.6920415224913494,
"grad_norm": 0.6281108260154724,
"learning_rate": 0.0001957684937354782,
"loss": 0.0582,
"step": 400
},
{
"epoch": 0.7093425605536332,
"grad_norm": 0.6687746644020081,
"learning_rate": 0.00019543270222108268,
"loss": 0.0714,
"step": 410
},
{
"epoch": 0.726643598615917,
"grad_norm": 0.6667928099632263,
"learning_rate": 0.00019508440166589753,
"loss": 0.0792,
"step": 420
},
{
"epoch": 0.7439446366782007,
"grad_norm": 0.7570515871047974,
"learning_rate": 0.00019472363772414563,
"loss": 0.0625,
"step": 430
},
{
"epoch": 0.7612456747404844,
"grad_norm": 0.5553678274154663,
"learning_rate": 0.00019435045768371415,
"loss": 0.063,
"step": 440
},
{
"epoch": 0.7785467128027682,
"grad_norm": 0.6174982190132141,
"learning_rate": 0.00019396491045995648,
"loss": 0.0832,
"step": 450
},
{
"epoch": 0.7958477508650519,
"grad_norm": 0.6104321479797363,
"learning_rate": 0.00019356704658928035,
"loss": 0.0564,
"step": 460
},
{
"epoch": 0.8131487889273357,
"grad_norm": 0.5369688868522644,
"learning_rate": 0.00019315691822252362,
"loss": 0.0647,
"step": 470
},
{
"epoch": 0.8304498269896193,
"grad_norm": 0.6868175268173218,
"learning_rate": 0.0001927345791181187,
"loss": 0.0586,
"step": 480
},
{
"epoch": 0.8477508650519031,
"grad_norm": 0.6508586406707764,
"learning_rate": 0.00019230008463504595,
"loss": 0.0569,
"step": 490
},
{
"epoch": 0.8650519031141869,
"grad_norm": 0.5308678150177002,
"learning_rate": 0.00019185349172557724,
"loss": 0.05,
"step": 500
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.41809916496276855,
"learning_rate": 0.00019139485892781118,
"loss": 0.0528,
"step": 510
},
{
"epoch": 0.8996539792387543,
"grad_norm": 0.6402536630630493,
"learning_rate": 0.00019092424635799962,
"loss": 0.0448,
"step": 520
},
{
"epoch": 0.916955017301038,
"grad_norm": 0.8051866888999939,
"learning_rate": 0.0001904417157026683,
"loss": 0.0567,
"step": 530
},
{
"epoch": 0.9342560553633218,
"grad_norm": 0.4536950886249542,
"learning_rate": 0.00018994733021053076,
"loss": 0.0711,
"step": 540
},
{
"epoch": 0.9515570934256056,
"grad_norm": 0.4057430922985077,
"learning_rate": 0.00018944115468419809,
"loss": 0.0491,
"step": 550
},
{
"epoch": 0.9688581314878892,
"grad_norm": 0.3847835063934326,
"learning_rate": 0.00018892325547168473,
"loss": 0.0563,
"step": 560
},
{
"epoch": 0.986159169550173,
"grad_norm": 0.5546027421951294,
"learning_rate": 0.00018839370045771182,
"loss": 0.0664,
"step": 570
},
{
"epoch": 1.0034602076124568,
"grad_norm": 0.5202723145484924,
"learning_rate": 0.00018785255905480897,
"loss": 0.0581,
"step": 580
},
{
"epoch": 1.0207612456747406,
"grad_norm": 0.729364812374115,
"learning_rate": 0.00018729990219421594,
"loss": 0.0485,
"step": 590
},
{
"epoch": 1.0380622837370241,
"grad_norm": 0.5977576375007629,
"learning_rate": 0.0001867358023165851,
"loss": 0.0545,
"step": 600
},
{
"epoch": 1.055363321799308,
"grad_norm": 0.5301733016967773,
"learning_rate": 0.00018616033336248632,
"loss": 0.0582,
"step": 610
},
{
"epoch": 1.0726643598615917,
"grad_norm": 0.5742247700691223,
"learning_rate": 0.00018557357076271475,
"loss": 0.0655,
"step": 620
},
{
"epoch": 1.0899653979238755,
"grad_norm": 0.7400846481323242,
"learning_rate": 0.0001849755914284039,
"loss": 0.0467,
"step": 630
},
{
"epoch": 1.1072664359861593,
"grad_norm": 0.7476756572723389,
"learning_rate": 0.00018436647374094406,
"loss": 0.0471,
"step": 640
},
{
"epoch": 1.1245674740484428,
"grad_norm": 0.5191195011138916,
"learning_rate": 0.00018374629754170854,
"loss": 0.0585,
"step": 650
},
{
"epoch": 1.1418685121107266,
"grad_norm": 0.711711585521698,
"learning_rate": 0.00018311514412158806,
"loss": 0.0526,
"step": 660
},
{
"epoch": 1.1591695501730104,
"grad_norm": 0.4129495620727539,
"learning_rate": 0.0001824730962103356,
"loss": 0.0436,
"step": 670
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.6265865564346313,
"learning_rate": 0.0001818202379657222,
"loss": 0.0672,
"step": 680
},
{
"epoch": 1.193771626297578,
"grad_norm": 0.5319189429283142,
"learning_rate": 0.0001811566549625061,
"loss": 0.0476,
"step": 690
},
{
"epoch": 1.2110726643598615,
"grad_norm": 0.4006735682487488,
"learning_rate": 0.00018048243418121551,
"loss": 0.0501,
"step": 700
},
{
"epoch": 1.2283737024221453,
"grad_norm": 0.7269253134727478,
"learning_rate": 0.00017979766399674776,
"loss": 0.0548,
"step": 710
},
{
"epoch": 1.245674740484429,
"grad_norm": 0.2751833200454712,
"learning_rate": 0.00017910243416678512,
"loss": 0.0419,
"step": 720
},
{
"epoch": 1.2629757785467128,
"grad_norm": 0.411044180393219,
"learning_rate": 0.00017839683582002982,
"loss": 0.0591,
"step": 730
},
{
"epoch": 1.2802768166089966,
"grad_norm": 0.3086739480495453,
"learning_rate": 0.00017768096144425902,
"loss": 0.041,
"step": 740
},
{
"epoch": 1.2975778546712804,
"grad_norm": 0.49662747979164124,
"learning_rate": 0.00017695490487420194,
"loss": 0.0476,
"step": 750
},
{
"epoch": 1.314878892733564,
"grad_norm": 0.3548310101032257,
"learning_rate": 0.0001762187612792401,
"loss": 0.0427,
"step": 760
},
{
"epoch": 1.3321799307958477,
"grad_norm": 0.5193726420402527,
"learning_rate": 0.00017547262715093291,
"loss": 0.0495,
"step": 770
},
{
"epoch": 1.3494809688581315,
"grad_norm": 0.5085720419883728,
"learning_rate": 0.00017471660029036987,
"loss": 0.0518,
"step": 780
},
{
"epoch": 1.3667820069204153,
"grad_norm": 1.0562312602996826,
"learning_rate": 0.00017395077979535088,
"loss": 0.0414,
"step": 790
},
{
"epoch": 1.3840830449826989,
"grad_norm": 0.3967028558254242,
"learning_rate": 0.00017317526604739708,
"loss": 0.0526,
"step": 800
},
{
"epoch": 1.4013840830449826,
"grad_norm": 0.5379968285560608,
"learning_rate": 0.0001723901606985929,
"loss": 0.0434,
"step": 810
},
{
"epoch": 1.4186851211072664,
"grad_norm": 0.4309251606464386,
"learning_rate": 0.00017159556665826195,
"loss": 0.0653,
"step": 820
},
{
"epoch": 1.4359861591695502,
"grad_norm": 0.5331924557685852,
"learning_rate": 0.0001707915880794778,
"loss": 0.0477,
"step": 830
},
{
"epoch": 1.453287197231834,
"grad_norm": 0.34852275252342224,
"learning_rate": 0.0001699783303454121,
"loss": 0.0403,
"step": 840
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.3794853389263153,
"learning_rate": 0.00016915590005552118,
"loss": 0.0428,
"step": 850
},
{
"epoch": 1.4878892733564013,
"grad_norm": 0.5624513030052185,
"learning_rate": 0.00016832440501157313,
"loss": 0.0466,
"step": 860
},
{
"epoch": 1.505190311418685,
"grad_norm": 0.2839530110359192,
"learning_rate": 0.0001674839542035178,
"loss": 0.0452,
"step": 870
},
{
"epoch": 1.5224913494809689,
"grad_norm": 0.624341607093811,
"learning_rate": 0.0001666346577952004,
"loss": 0.0481,
"step": 880
},
{
"epoch": 1.5397923875432526,
"grad_norm": 0.5866113305091858,
"learning_rate": 0.00016577662710992174,
"loss": 0.058,
"step": 890
},
{
"epoch": 1.5570934256055362,
"grad_norm": 0.5155909061431885,
"learning_rate": 0.00016490997461584617,
"loss": 0.0467,
"step": 900
},
{
"epoch": 1.57439446366782,
"grad_norm": 0.33836182951927185,
"learning_rate": 0.00016403481391125973,
"loss": 0.0434,
"step": 910
},
{
"epoch": 1.5916955017301038,
"grad_norm": 0.3541756570339203,
"learning_rate": 0.00016315125970967978,
"loss": 0.0568,
"step": 920
},
{
"epoch": 1.6089965397923875,
"grad_norm": 0.4802621603012085,
"learning_rate": 0.000162259427824819,
"loss": 0.0616,
"step": 930
},
{
"epoch": 1.6262975778546713,
"grad_norm": 0.3919571340084076,
"learning_rate": 0.00016135943515540455,
"loss": 0.0382,
"step": 940
},
{
"epoch": 1.643598615916955,
"grad_norm": 0.40250062942504883,
"learning_rate": 0.0001604513996698556,
"loss": 0.0446,
"step": 950
},
{
"epoch": 1.6608996539792389,
"grad_norm": 0.3435959219932556,
"learning_rate": 0.00015953544039082012,
"loss": 0.0473,
"step": 960
},
{
"epoch": 1.6782006920415224,
"grad_norm": 0.4662492275238037,
"learning_rate": 0.00015861167737957397,
"loss": 0.0373,
"step": 970
},
{
"epoch": 1.6955017301038062,
"grad_norm": 0.460500031709671,
"learning_rate": 0.00015768023172028342,
"loss": 0.0457,
"step": 980
},
{
"epoch": 1.71280276816609,
"grad_norm": 0.5573172569274902,
"learning_rate": 0.00015674122550413396,
"loss": 0.0421,
"step": 990
},
{
"epoch": 1.7301038062283736,
"grad_norm": 0.32693353295326233,
"learning_rate": 0.00015579478181332684,
"loss": 0.0375,
"step": 1000
},
{
"epoch": 1.7474048442906573,
"grad_norm": 0.3940613865852356,
"learning_rate": 0.00015484102470494576,
"loss": 0.0561,
"step": 1010
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.3881168067455292,
"learning_rate": 0.00015388007919469603,
"loss": 0.0361,
"step": 1020
},
{
"epoch": 1.782006920415225,
"grad_norm": 0.3204861581325531,
"learning_rate": 0.0001529120712405177,
"loss": 0.0375,
"step": 1030
},
{
"epoch": 1.7993079584775087,
"grad_norm": 0.47057387232780457,
"learning_rate": 0.00015193712772607537,
"loss": 0.0382,
"step": 1040
},
{
"epoch": 1.8166089965397925,
"grad_norm": 0.29308807849884033,
"learning_rate": 0.0001509553764441267,
"loss": 0.0331,
"step": 1050
},
{
"epoch": 1.8339100346020762,
"grad_norm": 0.4945255219936371,
"learning_rate": 0.00014996694607977176,
"loss": 0.0477,
"step": 1060
},
{
"epoch": 1.85121107266436,
"grad_norm": 0.3753719925880432,
"learning_rate": 0.00014897196619358526,
"loss": 0.0333,
"step": 1070
},
{
"epoch": 1.8685121107266436,
"grad_norm": 0.4464806914329529,
"learning_rate": 0.0001479705672046341,
"loss": 0.04,
"step": 1080
},
{
"epoch": 1.8858131487889274,
"grad_norm": 0.4522175192832947,
"learning_rate": 0.00014696288037338256,
"loss": 0.0426,
"step": 1090
},
{
"epoch": 1.903114186851211,
"grad_norm": 0.37199148535728455,
"learning_rate": 0.00014594903778448705,
"loss": 0.0432,
"step": 1100
},
{
"epoch": 1.9204152249134947,
"grad_norm": 0.3355126678943634,
"learning_rate": 0.00014492917232948263,
"loss": 0.0356,
"step": 1110
},
{
"epoch": 1.9377162629757785,
"grad_norm": 0.37697604298591614,
"learning_rate": 0.00014390341768936413,
"loss": 0.0359,
"step": 1120
},
{
"epoch": 1.9550173010380623,
"grad_norm": 0.4980825185775757,
"learning_rate": 0.00014287190831706372,
"loss": 0.0374,
"step": 1130
},
{
"epoch": 1.972318339100346,
"grad_norm": 0.3119935691356659,
"learning_rate": 0.00014183477941982704,
"loss": 0.034,
"step": 1140
},
{
"epoch": 1.9896193771626298,
"grad_norm": 0.3332310914993286,
"learning_rate": 0.00014079216694149076,
"loss": 0.0386,
"step": 1150
},
{
"epoch": 2.0069204152249136,
"grad_norm": 0.5753083229064941,
"learning_rate": 0.00013974420754466328,
"loss": 0.0356,
"step": 1160
},
{
"epoch": 2.0242214532871974,
"grad_norm": 0.6476043462753296,
"learning_rate": 0.00013869103859281165,
"loss": 0.0401,
"step": 1170
},
{
"epoch": 2.041522491349481,
"grad_norm": 0.6274977922439575,
"learning_rate": 0.0001376327981322561,
"loss": 0.0405,
"step": 1180
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.5121188163757324,
"learning_rate": 0.0001365696248740756,
"loss": 0.0409,
"step": 1190
},
{
"epoch": 2.0761245674740483,
"grad_norm": 0.38302546739578247,
"learning_rate": 0.0001355016581759257,
"loss": 0.0407,
"step": 1200
},
{
"epoch": 2.093425605536332,
"grad_norm": 0.365703821182251,
"learning_rate": 0.00013442903802377226,
"loss": 0.0439,
"step": 1210
},
{
"epoch": 2.110726643598616,
"grad_norm": 0.43399107456207275,
"learning_rate": 0.00013335190501354227,
"loss": 0.036,
"step": 1220
},
{
"epoch": 2.1280276816608996,
"grad_norm": 0.31331631541252136,
"learning_rate": 0.000132270400332695,
"loss": 0.0303,
"step": 1230
},
{
"epoch": 2.1453287197231834,
"grad_norm": 0.5099217295646667,
"learning_rate": 0.00013118466574171564,
"loss": 0.0339,
"step": 1240
},
{
"epoch": 2.162629757785467,
"grad_norm": 0.39915943145751953,
"learning_rate": 0.00013009484355553364,
"loss": 0.0371,
"step": 1250
},
{
"epoch": 2.179930795847751,
"grad_norm": 0.42217525839805603,
"learning_rate": 0.00012900107662486857,
"loss": 0.0271,
"step": 1260
},
{
"epoch": 2.1972318339100347,
"grad_norm": 0.35948893427848816,
"learning_rate": 0.00012790350831750556,
"loss": 0.0298,
"step": 1270
},
{
"epoch": 2.2145328719723185,
"grad_norm": 0.4188036620616913,
"learning_rate": 0.0001268022824995032,
"loss": 0.0413,
"step": 1280
},
{
"epoch": 2.2318339100346023,
"grad_norm": 0.3547791838645935,
"learning_rate": 0.0001256975435163359,
"loss": 0.0325,
"step": 1290
},
{
"epoch": 2.2491349480968856,
"grad_norm": 0.4535089433193207,
"learning_rate": 0.00012458943617397344,
"loss": 0.0382,
"step": 1300
},
{
"epoch": 2.2664359861591694,
"grad_norm": 0.39500531554222107,
"learning_rate": 0.00012347810571990055,
"loss": 0.0307,
"step": 1310
},
{
"epoch": 2.283737024221453,
"grad_norm": 0.41853851079940796,
"learning_rate": 0.00012236369782407783,
"loss": 0.0313,
"step": 1320
},
{
"epoch": 2.301038062283737,
"grad_norm": 0.3748478591442108,
"learning_rate": 0.0001212463585598481,
"loss": 0.0412,
"step": 1330
},
{
"epoch": 2.3183391003460208,
"grad_norm": 0.41274523735046387,
"learning_rate": 0.00012012623438478931,
"loss": 0.0473,
"step": 1340
},
{
"epoch": 2.3356401384083045,
"grad_norm": 0.24082864820957184,
"learning_rate": 0.0001190034721215176,
"loss": 0.0347,
"step": 1350
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.45620933175086975,
"learning_rate": 0.00011787821893844189,
"loss": 0.0437,
"step": 1360
},
{
"epoch": 2.370242214532872,
"grad_norm": 0.3050684630870819,
"learning_rate": 0.00011675062233047364,
"loss": 0.0369,
"step": 1370
},
{
"epoch": 2.387543252595156,
"grad_norm": 0.42224356532096863,
"learning_rate": 0.00011562083009969366,
"loss": 0.0322,
"step": 1380
},
{
"epoch": 2.404844290657439,
"grad_norm": 0.3712410032749176,
"learning_rate": 0.00011448899033597855,
"loss": 0.0386,
"step": 1390
},
{
"epoch": 2.422145328719723,
"grad_norm": 0.22947420179843903,
"learning_rate": 0.00011335525139758962,
"loss": 0.0375,
"step": 1400
},
{
"epoch": 2.4394463667820068,
"grad_norm": 0.42010223865509033,
"learning_rate": 0.00011221976189172644,
"loss": 0.0404,
"step": 1410
},
{
"epoch": 2.4567474048442905,
"grad_norm": 0.40255504846572876,
"learning_rate": 0.0001110826706550479,
"loss": 0.0356,
"step": 1420
},
{
"epoch": 2.4740484429065743,
"grad_norm": 0.6300822496414185,
"learning_rate": 0.00010994412673416303,
"loss": 0.0403,
"step": 1430
},
{
"epoch": 2.491349480968858,
"grad_norm": 0.4776591658592224,
"learning_rate": 0.00010880427936609455,
"loss": 0.0293,
"step": 1440
},
{
"epoch": 2.508650519031142,
"grad_norm": 0.5055949091911316,
"learning_rate": 0.0001076632779587172,
"loss": 0.039,
"step": 1450
},
{
"epoch": 2.5259515570934257,
"grad_norm": 0.43698614835739136,
"learning_rate": 0.00010652127207117386,
"loss": 0.033,
"step": 1460
},
{
"epoch": 2.5432525951557095,
"grad_norm": 0.4662177264690399,
"learning_rate": 0.00010537841139427178,
"loss": 0.0214,
"step": 1470
},
{
"epoch": 2.5605536332179932,
"grad_norm": 0.28925833106040955,
"learning_rate": 0.00010423484573086138,
"loss": 0.0317,
"step": 1480
},
{
"epoch": 2.577854671280277,
"grad_norm": 0.5237163305282593,
"learning_rate": 0.00010309072497620081,
"loss": 0.0295,
"step": 1490
},
{
"epoch": 2.595155709342561,
"grad_norm": 0.34184929728507996,
"learning_rate": 0.00010194619909830787,
"loss": 0.0389,
"step": 1500
},
{
"epoch": 2.612456747404844,
"grad_norm": 0.3619227111339569,
"learning_rate": 0.00010080141811830277,
"loss": 0.0278,
"step": 1510
},
{
"epoch": 2.629757785467128,
"grad_norm": 0.27774831652641296,
"learning_rate": 9.965653209074378e-05,
"loss": 0.0278,
"step": 1520
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.2855290472507477,
"learning_rate": 9.851169108395842e-05,
"loss": 0.031,
"step": 1530
},
{
"epoch": 2.6643598615916955,
"grad_norm": 0.29521238803863525,
"learning_rate": 9.736704516037317e-05,
"loss": 0.0282,
"step": 1540
},
{
"epoch": 2.6816608996539792,
"grad_norm": 0.30670079588890076,
"learning_rate": 9.622274435684334e-05,
"loss": 0.0376,
"step": 1550
},
{
"epoch": 2.698961937716263,
"grad_norm": 0.2929821014404297,
"learning_rate": 9.507893866498714e-05,
"loss": 0.0261,
"step": 1560
},
{
"epoch": 2.716262975778547,
"grad_norm": 0.23183377087116241,
"learning_rate": 9.393577801152486e-05,
"loss": 0.0253,
"step": 1570
},
{
"epoch": 2.7335640138408306,
"grad_norm": 0.20076750218868256,
"learning_rate": 9.279341223862705e-05,
"loss": 0.0317,
"step": 1580
},
{
"epoch": 2.750865051903114,
"grad_norm": 0.4574607014656067,
"learning_rate": 9.165199108427364e-05,
"loss": 0.0391,
"step": 1590
},
{
"epoch": 2.7681660899653977,
"grad_norm": 0.4383868873119354,
"learning_rate": 9.051166416262673e-05,
"loss": 0.0303,
"step": 1600
},
{
"epoch": 2.7854671280276815,
"grad_norm": 0.4536924362182617,
"learning_rate": 8.937258094441953e-05,
"loss": 0.0309,
"step": 1610
},
{
"epoch": 2.8027681660899653,
"grad_norm": 0.2295350879430771,
"learning_rate": 8.823489073736429e-05,
"loss": 0.0308,
"step": 1620
},
{
"epoch": 2.820069204152249,
"grad_norm": 0.363835871219635,
"learning_rate": 8.70987426665814e-05,
"loss": 0.0353,
"step": 1630
},
{
"epoch": 2.837370242214533,
"grad_norm": 0.15470369160175323,
"learning_rate": 8.596428565505245e-05,
"loss": 0.0261,
"step": 1640
},
{
"epoch": 2.8546712802768166,
"grad_norm": 0.45268356800079346,
"learning_rate": 8.483166840409995e-05,
"loss": 0.0298,
"step": 1650
},
{
"epoch": 2.8719723183391004,
"grad_norm": 0.2832735478878021,
"learning_rate": 8.370103937389595e-05,
"loss": 0.0343,
"step": 1660
},
{
"epoch": 2.889273356401384,
"grad_norm": 0.3091322183609009,
"learning_rate": 8.257254676400237e-05,
"loss": 0.0276,
"step": 1670
},
{
"epoch": 2.906574394463668,
"grad_norm": 0.30403849482536316,
"learning_rate": 8.144633849394527e-05,
"loss": 0.0259,
"step": 1680
},
{
"epoch": 2.9238754325259517,
"grad_norm": 0.41098812222480774,
"learning_rate": 8.032256218382618e-05,
"loss": 0.0416,
"step": 1690
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.23849168419837952,
"learning_rate": 7.920136513497232e-05,
"loss": 0.0285,
"step": 1700
},
{
"epoch": 2.9584775086505193,
"grad_norm": 0.3104817569255829,
"learning_rate": 7.808289431062892e-05,
"loss": 0.026,
"step": 1710
},
{
"epoch": 2.9757785467128026,
"grad_norm": 0.4680814743041992,
"learning_rate": 7.696729631669564e-05,
"loss": 0.0336,
"step": 1720
},
{
"epoch": 2.9930795847750864,
"grad_norm": 0.3355979323387146,
"learning_rate": 7.585471738250984e-05,
"loss": 0.0307,
"step": 1730
},
{
"epoch": 3.01038062283737,
"grad_norm": 0.5686395168304443,
"learning_rate": 7.474530334167935e-05,
"loss": 0.032,
"step": 1740
},
{
"epoch": 3.027681660899654,
"grad_norm": 0.2684016823768616,
"learning_rate": 7.363919961296699e-05,
"loss": 0.0244,
"step": 1750
},
{
"epoch": 3.0449826989619377,
"grad_norm": 0.2455485612154007,
"learning_rate": 7.253655118122948e-05,
"loss": 0.0245,
"step": 1760
},
{
"epoch": 3.0622837370242215,
"grad_norm": 0.35358694195747375,
"learning_rate": 7.143750257841333e-05,
"loss": 0.0267,
"step": 1770
},
{
"epoch": 3.0795847750865053,
"grad_norm": 0.18395821750164032,
"learning_rate": 7.034219786460987e-05,
"loss": 0.023,
"step": 1780
},
{
"epoch": 3.096885813148789,
"grad_norm": 0.3733113408088684,
"learning_rate": 6.925078060917245e-05,
"loss": 0.0276,
"step": 1790
},
{
"epoch": 3.114186851211073,
"grad_norm": 0.30367976427078247,
"learning_rate": 6.816339387189763e-05,
"loss": 0.0298,
"step": 1800
},
{
"epoch": 3.131487889273356,
"grad_norm": 0.3499404489994049,
"learning_rate": 6.708018018427343e-05,
"loss": 0.023,
"step": 1810
},
{
"epoch": 3.14878892733564,
"grad_norm": 0.24544458091259003,
"learning_rate": 6.600128153079661e-05,
"loss": 0.0187,
"step": 1820
},
{
"epoch": 3.1660899653979238,
"grad_norm": 0.2629719376564026,
"learning_rate": 6.492683933036183e-05,
"loss": 0.0244,
"step": 1830
},
{
"epoch": 3.1833910034602075,
"grad_norm": 0.46131864190101624,
"learning_rate": 6.38569944177249e-05,
"loss": 0.0294,
"step": 1840
},
{
"epoch": 3.2006920415224913,
"grad_norm": 0.2149653136730194,
"learning_rate": 6.279188702504252e-05,
"loss": 0.0221,
"step": 1850
},
{
"epoch": 3.217993079584775,
"grad_norm": 0.1689160317182541,
"learning_rate": 6.173165676349103e-05,
"loss": 0.0218,
"step": 1860
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.22369259595870972,
"learning_rate": 6.0676442604966654e-05,
"loss": 0.0181,
"step": 1870
},
{
"epoch": 3.2525951557093427,
"grad_norm": 0.16078820824623108,
"learning_rate": 5.9626382863869414e-05,
"loss": 0.0236,
"step": 1880
},
{
"epoch": 3.2698961937716264,
"grad_norm": 0.36825454235076904,
"learning_rate": 5.8581615178973274e-05,
"loss": 0.0264,
"step": 1890
},
{
"epoch": 3.28719723183391,
"grad_norm": 0.3015175759792328,
"learning_rate": 5.754227649538497e-05,
"loss": 0.0306,
"step": 1900
},
{
"epoch": 3.304498269896194,
"grad_norm": 0.4220731556415558,
"learning_rate": 5.6508503046593484e-05,
"loss": 0.0237,
"step": 1910
},
{
"epoch": 3.3217993079584773,
"grad_norm": 0.4073288142681122,
"learning_rate": 5.548043033661297e-05,
"loss": 0.0164,
"step": 1920
},
{
"epoch": 3.339100346020761,
"grad_norm": 0.27142080664634705,
"learning_rate": 5.44581931222214e-05,
"loss": 0.0255,
"step": 1930
},
{
"epoch": 3.356401384083045,
"grad_norm": 0.25887903571128845,
"learning_rate": 5.3441925395297065e-05,
"loss": 0.0176,
"step": 1940
},
{
"epoch": 3.3737024221453287,
"grad_norm": 0.3199214041233063,
"learning_rate": 5.243176036525499e-05,
"loss": 0.0276,
"step": 1950
},
{
"epoch": 3.3910034602076125,
"grad_norm": 0.24466337263584137,
"learning_rate": 5.142783044158668e-05,
"loss": 0.0232,
"step": 1960
},
{
"epoch": 3.4083044982698962,
"grad_norm": 0.38157567381858826,
"learning_rate": 5.043026721650388e-05,
"loss": 0.0342,
"step": 1970
},
{
"epoch": 3.42560553633218,
"grad_norm": 0.2125539630651474,
"learning_rate": 4.943920144769013e-05,
"loss": 0.0268,
"step": 1980
},
{
"epoch": 3.442906574394464,
"grad_norm": 0.3309932351112366,
"learning_rate": 4.845476304116132e-05,
"loss": 0.0214,
"step": 1990
},
{
"epoch": 3.4602076124567476,
"grad_norm": 0.2618106007575989,
"learning_rate": 4.74770810342379e-05,
"loss": 0.0249,
"step": 2000
},
{
"epoch": 3.477508650519031,
"grad_norm": 0.20525750517845154,
"learning_rate": 4.650628357863113e-05,
"loss": 0.0228,
"step": 2010
},
{
"epoch": 3.4948096885813147,
"grad_norm": 0.2250107228755951,
"learning_rate": 4.5542497923645456e-05,
"loss": 0.0252,
"step": 2020
},
{
"epoch": 3.5121107266435985,
"grad_norm": 0.3091624975204468,
"learning_rate": 4.458585039949874e-05,
"loss": 0.0204,
"step": 2030
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.3369143605232239,
"learning_rate": 4.363646640076355e-05,
"loss": 0.0227,
"step": 2040
},
{
"epoch": 3.546712802768166,
"grad_norm": 0.2998056709766388,
"learning_rate": 4.2694470369930697e-05,
"loss": 0.0191,
"step": 2050
},
{
"epoch": 3.56401384083045,
"grad_norm": 0.1255960911512375,
"learning_rate": 4.175998578109756e-05,
"loss": 0.0225,
"step": 2060
},
{
"epoch": 3.5813148788927336,
"grad_norm": 0.21881087124347687,
"learning_rate": 4.0833135123783683e-05,
"loss": 0.0248,
"step": 2070
},
{
"epoch": 3.5986159169550174,
"grad_norm": 0.20889127254486084,
"learning_rate": 3.991403988687499e-05,
"loss": 0.0249,
"step": 2080
},
{
"epoch": 3.615916955017301,
"grad_norm": 0.2735111713409424,
"learning_rate": 3.900282054269954e-05,
"loss": 0.0207,
"step": 2090
},
{
"epoch": 3.633217993079585,
"grad_norm": 0.3216061294078827,
"learning_rate": 3.8099596531236357e-05,
"loss": 0.0191,
"step": 2100
},
{
"epoch": 3.6505190311418687,
"grad_norm": 0.31375864148139954,
"learning_rate": 3.7204486244459334e-05,
"loss": 0.0261,
"step": 2110
},
{
"epoch": 3.6678200692041525,
"grad_norm": 0.37862274050712585,
"learning_rate": 3.631760701081913e-05,
"loss": 0.028,
"step": 2120
},
{
"epoch": 3.685121107266436,
"grad_norm": 0.34272995591163635,
"learning_rate": 3.5439075079863913e-05,
"loss": 0.0249,
"step": 2130
},
{
"epoch": 3.7024221453287196,
"grad_norm": 0.35570189356803894,
"learning_rate": 3.456900560700158e-05,
"loss": 0.019,
"step": 2140
},
{
"epoch": 3.7197231833910034,
"grad_norm": 0.2902262806892395,
"learning_rate": 3.370751263840581e-05,
"loss": 0.0202,
"step": 2150
},
{
"epoch": 3.737024221453287,
"grad_norm": 0.1935553103685379,
"learning_rate": 3.285470909606696e-05,
"loss": 0.026,
"step": 2160
},
{
"epoch": 3.754325259515571,
"grad_norm": 0.1905568242073059,
"learning_rate": 3.2010706762990736e-05,
"loss": 0.0172,
"step": 2170
},
{
"epoch": 3.7716262975778547,
"grad_norm": 0.27930760383605957,
"learning_rate": 3.117561626854601e-05,
"loss": 0.0149,
"step": 2180
},
{
"epoch": 3.7889273356401385,
"grad_norm": 0.38801103830337524,
"learning_rate": 3.0349547073963693e-05,
"loss": 0.0202,
"step": 2190
},
{
"epoch": 3.8062283737024223,
"grad_norm": 0.171518012881279,
"learning_rate": 2.953260745798898e-05,
"loss": 0.0226,
"step": 2200
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.22121591866016388,
"learning_rate": 2.8724904502688566e-05,
"loss": 0.022,
"step": 2210
},
{
"epoch": 3.8408304498269894,
"grad_norm": 0.36183586716651917,
"learning_rate": 2.792654407941444e-05,
"loss": 0.0207,
"step": 2220
},
{
"epoch": 3.858131487889273,
"grad_norm": 0.49139508605003357,
"learning_rate": 2.7137630834926788e-05,
"loss": 0.0237,
"step": 2230
},
{
"epoch": 3.875432525951557,
"grad_norm": 0.29921260476112366,
"learning_rate": 2.635826817767708e-05,
"loss": 0.021,
"step": 2240
},
{
"epoch": 3.8927335640138407,
"grad_norm": 0.2761327922344208,
"learning_rate": 2.5588558264253547e-05,
"loss": 0.0183,
"step": 2250
},
{
"epoch": 3.9100346020761245,
"grad_norm": 0.21798908710479736,
"learning_rate": 2.4828601985990983e-05,
"loss": 0.0301,
"step": 2260
},
{
"epoch": 3.9273356401384083,
"grad_norm": 0.20128929615020752,
"learning_rate": 2.407849895574592e-05,
"loss": 0.0263,
"step": 2270
},
{
"epoch": 3.944636678200692,
"grad_norm": 0.3022063672542572,
"learning_rate": 2.3338347494839997e-05,
"loss": 0.0249,
"step": 2280
},
{
"epoch": 3.961937716262976,
"grad_norm": 0.13671119511127472,
"learning_rate": 2.260824462017195e-05,
"loss": 0.0189,
"step": 2290
},
{
"epoch": 3.9792387543252596,
"grad_norm": 0.217565655708313,
"learning_rate": 2.1888286031501216e-05,
"loss": 0.022,
"step": 2300
},
{
"epoch": 3.9965397923875434,
"grad_norm": 0.335618793964386,
"learning_rate": 2.1178566098903674e-05,
"loss": 0.0188,
"step": 2310
},
{
"epoch": 4.013840830449827,
"grad_norm": 0.22030693292617798,
"learning_rate": 2.047917785040202e-05,
"loss": 0.0179,
"step": 2320
},
{
"epoch": 4.031141868512111,
"grad_norm": 0.23955483734607697,
"learning_rate": 1.9790212959771815e-05,
"loss": 0.0194,
"step": 2330
},
{
"epoch": 4.048442906574395,
"grad_norm": 0.4007752239704132,
"learning_rate": 1.911176173452529e-05,
"loss": 0.0213,
"step": 2340
},
{
"epoch": 4.0657439446366785,
"grad_norm": 0.41095274686813354,
"learning_rate": 1.8443913104073983e-05,
"loss": 0.0227,
"step": 2350
},
{
"epoch": 4.083044982698962,
"grad_norm": 0.29689717292785645,
"learning_rate": 1.7786754608072154e-05,
"loss": 0.0182,
"step": 2360
},
{
"epoch": 4.100346020761246,
"grad_norm": 0.4345749616622925,
"learning_rate": 1.7140372384942427e-05,
"loss": 0.0261,
"step": 2370
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.2638370990753174,
"learning_rate": 1.6504851160584854e-05,
"loss": 0.0141,
"step": 2380
},
{
"epoch": 4.134948096885813,
"grad_norm": 0.271502822637558,
"learning_rate": 1.5880274237271442e-05,
"loss": 0.0238,
"step": 2390
},
{
"epoch": 4.1522491349480966,
"grad_norm": 0.32822224497795105,
"learning_rate": 1.5266723482727075e-05,
"loss": 0.0169,
"step": 2400
},
{
"epoch": 4.16955017301038,
"grad_norm": 0.37470826506614685,
"learning_rate": 1.4664279319398566e-05,
"loss": 0.0189,
"step": 2410
},
{
"epoch": 4.186851211072664,
"grad_norm": 0.25935235619544983,
"learning_rate": 1.4073020713912987e-05,
"loss": 0.0232,
"step": 2420
},
{
"epoch": 4.204152249134948,
"grad_norm": 0.26056811213493347,
"learning_rate": 1.349302516672717e-05,
"loss": 0.022,
"step": 2430
},
{
"epoch": 4.221453287197232,
"grad_norm": 0.2688888609409332,
"learning_rate": 1.2924368701968936e-05,
"loss": 0.0165,
"step": 2440
},
{
"epoch": 4.2387543252595155,
"grad_norm": 0.2515370547771454,
"learning_rate": 1.2367125857472283e-05,
"loss": 0.0262,
"step": 2450
},
{
"epoch": 4.256055363321799,
"grad_norm": 0.22485879063606262,
"learning_rate": 1.1821369675007076e-05,
"loss": 0.0215,
"step": 2460
},
{
"epoch": 4.273356401384083,
"grad_norm": 0.16641457378864288,
"learning_rate": 1.1287171690704923e-05,
"loss": 0.0168,
"step": 2470
},
{
"epoch": 4.290657439446367,
"grad_norm": 0.20574532449245453,
"learning_rate": 1.076460192568246e-05,
"loss": 0.0191,
"step": 2480
},
{
"epoch": 4.307958477508651,
"grad_norm": 0.22523482143878937,
"learning_rate": 1.0253728876863255e-05,
"loss": 0.0228,
"step": 2490
},
{
"epoch": 4.325259515570934,
"grad_norm": 0.23874153196811676,
"learning_rate": 9.754619507999286e-06,
"loss": 0.0243,
"step": 2500
},
{
"epoch": 4.342560553633218,
"grad_norm": 0.19232115149497986,
"learning_rate": 9.26733924089369e-06,
"loss": 0.0192,
"step": 2510
},
{
"epoch": 4.359861591695502,
"grad_norm": 0.21156880259513855,
"learning_rate": 8.791951946825305e-06,
"loss": 0.0167,
"step": 2520
},
{
"epoch": 4.377162629757786,
"grad_norm": 0.13566535711288452,
"learning_rate": 8.328519938176737e-06,
"loss": 0.0159,
"step": 2530
},
{
"epoch": 4.3944636678200695,
"grad_norm": 0.23292118310928345,
"learning_rate": 7.877103960266574e-06,
"loss": 0.0233,
"step": 2540
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.31070488691329956,
"learning_rate": 7.437763183387048e-06,
"loss": 0.0186,
"step": 2550
},
{
"epoch": 4.429065743944637,
"grad_norm": 0.47433632612228394,
"learning_rate": 7.010555195048241e-06,
"loss": 0.0257,
"step": 2560
},
{
"epoch": 4.446366782006921,
"grad_norm": 0.1894678920507431,
"learning_rate": 6.59553599242958e-06,
"loss": 0.0189,
"step": 2570
},
{
"epoch": 4.463667820069205,
"grad_norm": 0.24350449442863464,
"learning_rate": 6.1927599750399634e-06,
"loss": 0.0213,
"step": 2580
},
{
"epoch": 4.4809688581314875,
"grad_norm": 0.198005810379982,
"learning_rate": 5.802279937587218e-06,
"loss": 0.015,
"step": 2590
},
{
"epoch": 4.498269896193771,
"grad_norm": 0.1783621609210968,
"learning_rate": 5.424147063057938e-06,
"loss": 0.0225,
"step": 2600
},
{
"epoch": 4.515570934256055,
"grad_norm": 0.1628614068031311,
"learning_rate": 5.058410916008494e-06,
"loss": 0.0163,
"step": 2610
},
{
"epoch": 4.532871972318339,
"grad_norm": 0.28581690788269043,
"learning_rate": 4.70511943606835e-06,
"loss": 0.0161,
"step": 2620
},
{
"epoch": 4.550173010380623,
"grad_norm": 0.12549340724945068,
"learning_rate": 4.364318931656186e-06,
"loss": 0.0187,
"step": 2630
},
{
"epoch": 4.567474048442906,
"grad_norm": 0.23058100044727325,
"learning_rate": 4.0360540739100335e-06,
"loss": 0.0234,
"step": 2640
},
{
"epoch": 4.58477508650519,
"grad_norm": 0.21442987024784088,
"learning_rate": 3.7203678908318327e-06,
"loss": 0.0215,
"step": 2650
},
{
"epoch": 4.602076124567474,
"grad_norm": 0.15471522510051727,
"learning_rate": 3.417301761647429e-06,
"loss": 0.0145,
"step": 2660
},
{
"epoch": 4.619377162629758,
"grad_norm": 0.3764342963695526,
"learning_rate": 3.1268954113827798e-06,
"loss": 0.0256,
"step": 2670
},
{
"epoch": 4.6366782006920415,
"grad_norm": 0.32330504059791565,
"learning_rate": 2.8491869056568643e-06,
"loss": 0.0196,
"step": 2680
},
{
"epoch": 4.653979238754325,
"grad_norm": 0.27203860878944397,
"learning_rate": 2.5842126456921633e-06,
"loss": 0.0157,
"step": 2690
},
{
"epoch": 4.671280276816609,
"grad_norm": 0.1730179786682129,
"learning_rate": 2.3320073635432984e-06,
"loss": 0.0184,
"step": 2700
},
{
"epoch": 4.688581314878893,
"grad_norm": 0.2550993263721466,
"learning_rate": 2.092604117544461e-06,
"loss": 0.0146,
"step": 2710
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.23976610600948334,
"learning_rate": 1.8660342879761817e-06,
"loss": 0.0198,
"step": 2720
},
{
"epoch": 4.72318339100346,
"grad_norm": 0.1788170337677002,
"learning_rate": 1.6523275729521615e-06,
"loss": 0.012,
"step": 2730
},
{
"epoch": 4.740484429065744,
"grad_norm": 0.16593609750270844,
"learning_rate": 1.4515119845264658e-06,
"loss": 0.0118,
"step": 2740
},
{
"epoch": 4.757785467128028,
"grad_norm": 0.2375115007162094,
"learning_rate": 1.2636138450218382e-06,
"loss": 0.0186,
"step": 2750
},
{
"epoch": 4.775086505190312,
"grad_norm": 0.163288876414299,
"learning_rate": 1.0886577835793831e-06,
"loss": 0.0156,
"step": 2760
},
{
"epoch": 4.7923875432525955,
"grad_norm": 0.19594237208366394,
"learning_rate": 9.26666732930348e-07,
"loss": 0.014,
"step": 2770
},
{
"epoch": 4.809688581314878,
"grad_norm": 0.09780792146921158,
"learning_rate": 7.776619263900387e-07,
"loss": 0.0152,
"step": 2780
},
{
"epoch": 4.826989619377162,
"grad_norm": 0.17942482233047485,
"learning_rate": 6.416628950747461e-07,
"loss": 0.0183,
"step": 2790
},
{
"epoch": 4.844290657439446,
"grad_norm": 0.34686434268951416,
"learning_rate": 5.186874653415718e-07,
"loss": 0.0188,
"step": 2800
},
{
"epoch": 4.86159169550173,
"grad_norm": 0.1715351790189743,
"learning_rate": 4.087517564518528e-07,
"loss": 0.0139,
"step": 2810
},
{
"epoch": 4.8788927335640135,
"grad_norm": 0.21105673909187317,
"learning_rate": 3.1187017845827337e-07,
"loss": 0.0162,
"step": 2820
},
{
"epoch": 4.896193771626297,
"grad_norm": 0.2553834319114685,
"learning_rate": 2.2805543031604314e-07,
"loss": 0.0241,
"step": 2830
},
{
"epoch": 4.913494809688581,
"grad_norm": 0.2684378921985626,
"learning_rate": 1.5731849821833954e-07,
"loss": 0.0203,
"step": 2840
},
{
"epoch": 4.930795847750865,
"grad_norm": 0.2061266154050827,
"learning_rate": 9.966865415631521e-08,
"loss": 0.0195,
"step": 2850
},
{
"epoch": 4.948096885813149,
"grad_norm": 0.19463296234607697,
"learning_rate": 5.5113454703692445e-08,
"loss": 0.0146,
"step": 2860
},
{
"epoch": 4.965397923875432,
"grad_norm": 0.27894893288612366,
"learning_rate": 2.3658740026311077e-08,
"loss": 0.0225,
"step": 2870
},
{
"epoch": 4.982698961937716,
"grad_norm": 0.26400962471961975,
"learning_rate": 5.3086331166074535e-09,
"loss": 0.0168,
"step": 2880
},
{
"epoch": 4.998269896193771,
"step": 2889,
"total_flos": 1.012715054916768e+17,
"train_loss": 0.04862415902999991,
"train_runtime": 1280.7015,
"train_samples_per_second": 36.093,
"train_steps_per_second": 2.256
}
],
"logging_steps": 10,
"max_steps": 2889,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.012715054916768e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}