{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998269896193771,
"eval_steps": 500,
"global_step": 2889,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01730103806228374,
"grad_norm": 7.182409286499023,
"learning_rate": 1.3793103448275863e-05,
"loss": 0.9301,
"step": 10
},
{
"epoch": 0.03460207612456748,
"grad_norm": 5.011022090911865,
"learning_rate": 2.7586206896551727e-05,
"loss": 0.6074,
"step": 20
},
{
"epoch": 0.05190311418685121,
"grad_norm": 2.866461992263794,
"learning_rate": 4.1379310344827587e-05,
"loss": 0.3194,
"step": 30
},
{
"epoch": 0.06920415224913495,
"grad_norm": 1.375663161277771,
"learning_rate": 5.517241379310345e-05,
"loss": 0.2251,
"step": 40
},
{
"epoch": 0.08650519031141868,
"grad_norm": 2.0214180946350098,
"learning_rate": 6.896551724137931e-05,
"loss": 0.1793,
"step": 50
},
{
"epoch": 0.10380622837370242,
"grad_norm": 1.3289449214935303,
"learning_rate": 8.275862068965517e-05,
"loss": 0.1596,
"step": 60
},
{
"epoch": 0.12110726643598616,
"grad_norm": 1.1882271766662598,
"learning_rate": 9.655172413793105e-05,
"loss": 0.1548,
"step": 70
},
{
"epoch": 0.1384083044982699,
"grad_norm": 1.2069551944732666,
"learning_rate": 0.0001103448275862069,
"loss": 0.1448,
"step": 80
},
{
"epoch": 0.15570934256055363,
"grad_norm": 0.7576473355293274,
"learning_rate": 0.00012413793103448277,
"loss": 0.119,
"step": 90
},
{
"epoch": 0.17301038062283736,
"grad_norm": 1.3533903360366821,
"learning_rate": 0.00013793103448275863,
"loss": 0.1164,
"step": 100
},
{
"epoch": 0.1903114186851211,
"grad_norm": 1.136509656906128,
"learning_rate": 0.00015172413793103449,
"loss": 0.1099,
"step": 110
},
{
"epoch": 0.20761245674740483,
"grad_norm": 1.2311946153640747,
"learning_rate": 0.00016551724137931035,
"loss": 0.1293,
"step": 120
},
{
"epoch": 0.22491349480968859,
"grad_norm": 1.382206678390503,
"learning_rate": 0.0001793103448275862,
"loss": 0.1049,
"step": 130
},
{
"epoch": 0.2422145328719723,
"grad_norm": 1.087380290031433,
"learning_rate": 0.0001931034482758621,
"loss": 0.1011,
"step": 140
},
{
"epoch": 0.25951557093425603,
"grad_norm": 1.564366340637207,
"learning_rate": 0.0001999983615229662,
"loss": 0.1222,
"step": 150
},
{
"epoch": 0.2768166089965398,
"grad_norm": 0.7164430618286133,
"learning_rate": 0.00019998525402884653,
"loss": 0.1028,
"step": 160
},
{
"epoch": 0.29411764705882354,
"grad_norm": 0.96863853931427,
"learning_rate": 0.0001999590407586994,
"loss": 0.0966,
"step": 170
},
{
"epoch": 0.31141868512110726,
"grad_norm": 0.5991038084030151,
"learning_rate": 0.000199919725148484,
"loss": 0.0912,
"step": 180
},
{
"epoch": 0.328719723183391,
"grad_norm": 0.7860184907913208,
"learning_rate": 0.00019986731235157592,
"loss": 0.0987,
"step": 190
},
{
"epoch": 0.3460207612456747,
"grad_norm": 1.4145984649658203,
"learning_rate": 0.00019980180923809214,
"loss": 0.1068,
"step": 200
},
{
"epoch": 0.3633217993079585,
"grad_norm": 0.9435092806816101,
"learning_rate": 0.00019972322439399,
"loss": 0.0907,
"step": 210
},
{
"epoch": 0.3806228373702422,
"grad_norm": 0.521682858467102,
"learning_rate": 0.00019963156811994215,
"loss": 0.0955,
"step": 220
},
{
"epoch": 0.39792387543252594,
"grad_norm": 0.5504732131958008,
"learning_rate": 0.0001995268524299861,
"loss": 0.0821,
"step": 230
},
{
"epoch": 0.41522491349480967,
"grad_norm": 0.867936909198761,
"learning_rate": 0.00019940909104994973,
"loss": 0.073,
"step": 240
},
{
"epoch": 0.43252595155709345,
"grad_norm": 0.9123047590255737,
"learning_rate": 0.00019927829941565186,
"loss": 0.0746,
"step": 250
},
{
"epoch": 0.44982698961937717,
"grad_norm": 0.5221778750419617,
"learning_rate": 0.00019913449467087916,
"loss": 0.0718,
"step": 260
},
{
"epoch": 0.4671280276816609,
"grad_norm": 0.7056018114089966,
"learning_rate": 0.00019897769566513897,
"loss": 0.0649,
"step": 270
},
{
"epoch": 0.4844290657439446,
"grad_norm": 0.8671623468399048,
"learning_rate": 0.00019880792295118852,
"loss": 0.085,
"step": 280
},
{
"epoch": 0.5017301038062284,
"grad_norm": 0.9353134632110596,
"learning_rate": 0.00019862519878234084,
"loss": 0.0635,
"step": 290
},
{
"epoch": 0.5190311418685121,
"grad_norm": 0.669700026512146,
"learning_rate": 0.00019842954710954812,
"loss": 0.0704,
"step": 300
},
{
"epoch": 0.5363321799307958,
"grad_norm": 0.6092180013656616,
"learning_rate": 0.000198220993578262,
"loss": 0.0616,
"step": 310
},
{
"epoch": 0.5536332179930796,
"grad_norm": 0.44123342633247375,
"learning_rate": 0.00019799956552507233,
"loss": 0.0746,
"step": 320
},
{
"epoch": 0.5709342560553633,
"grad_norm": 0.9491882920265198,
"learning_rate": 0.00019776529197412362,
"loss": 0.0814,
"step": 330
},
{
"epoch": 0.5882352941176471,
"grad_norm": 0.7369952201843262,
"learning_rate": 0.00019751820363331097,
"loss": 0.0629,
"step": 340
},
{
"epoch": 0.6055363321799307,
"grad_norm": 0.5006421208381653,
"learning_rate": 0.00019725833289025476,
"loss": 0.0616,
"step": 350
},
{
"epoch": 0.6228373702422145,
"grad_norm": 0.7486262917518616,
"learning_rate": 0.00019698571380805552,
"loss": 0.0658,
"step": 360
},
{
"epoch": 0.6401384083044983,
"grad_norm": 0.4780927896499634,
"learning_rate": 0.00019670038212082886,
"loss": 0.0492,
"step": 370
},
{
"epoch": 0.657439446366782,
"grad_norm": 0.6117326617240906,
"learning_rate": 0.00019640237522902174,
"loss": 0.0603,
"step": 380
},
{
"epoch": 0.6747404844290658,
"grad_norm": 0.4989817440509796,
"learning_rate": 0.00019609173219450998,
"loss": 0.0635,
"step": 390
},
{
"epoch": 0.6920415224913494,
"grad_norm": 0.32066667079925537,
"learning_rate": 0.0001957684937354782,
"loss": 0.0462,
"step": 400
},
{
"epoch": 0.7093425605536332,
"grad_norm": 0.7751433253288269,
"learning_rate": 0.00019543270222108268,
"loss": 0.0556,
"step": 410
},
{
"epoch": 0.726643598615917,
"grad_norm": 0.4977259635925293,
"learning_rate": 0.00019508440166589753,
"loss": 0.0651,
"step": 420
},
{
"epoch": 0.7439446366782007,
"grad_norm": 0.6962701082229614,
"learning_rate": 0.00019472363772414563,
"loss": 0.0554,
"step": 430
},
{
"epoch": 0.7612456747404844,
"grad_norm": 0.5899763107299805,
"learning_rate": 0.00019435045768371415,
"loss": 0.0536,
"step": 440
},
{
"epoch": 0.7785467128027682,
"grad_norm": 0.6964747905731201,
"learning_rate": 0.00019396491045995648,
"loss": 0.0677,
"step": 450
},
{
"epoch": 0.7958477508650519,
"grad_norm": 0.59555584192276,
"learning_rate": 0.00019356704658928035,
"loss": 0.0515,
"step": 460
},
{
"epoch": 0.8131487889273357,
"grad_norm": 0.5397696495056152,
"learning_rate": 0.00019315691822252362,
"loss": 0.0587,
"step": 470
},
{
"epoch": 0.8304498269896193,
"grad_norm": 0.4341135025024414,
"learning_rate": 0.0001927345791181187,
"loss": 0.0512,
"step": 480
},
{
"epoch": 0.8477508650519031,
"grad_norm": 0.3942558467388153,
"learning_rate": 0.00019230008463504595,
"loss": 0.0494,
"step": 490
},
{
"epoch": 0.8650519031141869,
"grad_norm": 0.47972944378852844,
"learning_rate": 0.00019185349172557724,
"loss": 0.0439,
"step": 500
},
{
"epoch": 0.8823529411764706,
"grad_norm": 0.6018040180206299,
"learning_rate": 0.00019139485892781118,
"loss": 0.05,
"step": 510
},
{
"epoch": 0.8996539792387543,
"grad_norm": 0.7354676127433777,
"learning_rate": 0.00019092424635799962,
"loss": 0.0418,
"step": 520
},
{
"epoch": 0.916955017301038,
"grad_norm": 0.5309942960739136,
"learning_rate": 0.0001904417157026683,
"loss": 0.0474,
"step": 530
},
{
"epoch": 0.9342560553633218,
"grad_norm": 0.4427996873855591,
"learning_rate": 0.00018994733021053076,
"loss": 0.0615,
"step": 540
},
{
"epoch": 0.9515570934256056,
"grad_norm": 0.47377070784568787,
"learning_rate": 0.00018944115468419809,
"loss": 0.0426,
"step": 550
},
{
"epoch": 0.9688581314878892,
"grad_norm": 0.4655269682407379,
"learning_rate": 0.00018892325547168473,
"loss": 0.0511,
"step": 560
},
{
"epoch": 0.986159169550173,
"grad_norm": 0.6078908443450928,
"learning_rate": 0.00018839370045771182,
"loss": 0.061,
"step": 570
},
{
"epoch": 1.0034602076124568,
"grad_norm": 0.6679680943489075,
"learning_rate": 0.00018785255905480897,
"loss": 0.0539,
"step": 580
},
{
"epoch": 1.0207612456747406,
"grad_norm": 0.752947986125946,
"learning_rate": 0.00018729990219421594,
"loss": 0.0415,
"step": 590
},
{
"epoch": 1.0380622837370241,
"grad_norm": 0.7484295964241028,
"learning_rate": 0.0001867358023165851,
"loss": 0.0506,
"step": 600
},
{
"epoch": 1.055363321799308,
"grad_norm": 0.6325497031211853,
"learning_rate": 0.00018616033336248632,
"loss": 0.0521,
"step": 610
},
{
"epoch": 1.0726643598615917,
"grad_norm": 0.6303852796554565,
"learning_rate": 0.00018557357076271475,
"loss": 0.0566,
"step": 620
},
{
"epoch": 1.0899653979238755,
"grad_norm": 0.49704891443252563,
"learning_rate": 0.0001849755914284039,
"loss": 0.0432,
"step": 630
},
{
"epoch": 1.1072664359861593,
"grad_norm": 0.5623544454574585,
"learning_rate": 0.00018436647374094406,
"loss": 0.0377,
"step": 640
},
{
"epoch": 1.1245674740484428,
"grad_norm": 0.35646283626556396,
"learning_rate": 0.00018374629754170854,
"loss": 0.0477,
"step": 650
},
{
"epoch": 1.1418685121107266,
"grad_norm": 0.5910556316375732,
"learning_rate": 0.00018311514412158806,
"loss": 0.0447,
"step": 660
},
{
"epoch": 1.1591695501730104,
"grad_norm": 0.6531932353973389,
"learning_rate": 0.0001824730962103356,
"loss": 0.0397,
"step": 670
},
{
"epoch": 1.1764705882352942,
"grad_norm": 0.5327192544937134,
"learning_rate": 0.0001818202379657222,
"loss": 0.0609,
"step": 680
},
{
"epoch": 1.193771626297578,
"grad_norm": 0.5592876076698303,
"learning_rate": 0.0001811566549625061,
"loss": 0.0445,
"step": 690
},
{
"epoch": 1.2110726643598615,
"grad_norm": 0.5012805461883545,
"learning_rate": 0.00018048243418121551,
"loss": 0.0467,
"step": 700
},
{
"epoch": 1.2283737024221453,
"grad_norm": 0.6008819341659546,
"learning_rate": 0.00017979766399674776,
"loss": 0.0475,
"step": 710
},
{
"epoch": 1.245674740484429,
"grad_norm": 0.3577163815498352,
"learning_rate": 0.00017910243416678512,
"loss": 0.0407,
"step": 720
},
{
"epoch": 1.2629757785467128,
"grad_norm": 0.3694828152656555,
"learning_rate": 0.00017839683582002982,
"loss": 0.0498,
"step": 730
},
{
"epoch": 1.2802768166089966,
"grad_norm": 0.3278524577617645,
"learning_rate": 0.00017768096144425902,
"loss": 0.0392,
"step": 740
},
{
"epoch": 1.2975778546712804,
"grad_norm": 0.5506361126899719,
"learning_rate": 0.00017695490487420194,
"loss": 0.043,
"step": 750
},
{
"epoch": 1.314878892733564,
"grad_norm": 0.3072315752506256,
"learning_rate": 0.0001762187612792401,
"loss": 0.0379,
"step": 760
},
{
"epoch": 1.3321799307958477,
"grad_norm": 0.6110448837280273,
"learning_rate": 0.00017547262715093291,
"loss": 0.0475,
"step": 770
},
{
"epoch": 1.3494809688581315,
"grad_norm": 0.3272629678249359,
"learning_rate": 0.00017471660029036987,
"loss": 0.0471,
"step": 780
},
{
"epoch": 1.3667820069204153,
"grad_norm": 0.7174297571182251,
"learning_rate": 0.00017395077979535088,
"loss": 0.0386,
"step": 790
},
{
"epoch": 1.3840830449826989,
"grad_norm": 0.5086835026741028,
"learning_rate": 0.00017317526604739708,
"loss": 0.0508,
"step": 800
},
{
"epoch": 1.4013840830449826,
"grad_norm": 0.3599720597267151,
"learning_rate": 0.0001723901606985929,
"loss": 0.0377,
"step": 810
},
{
"epoch": 1.4186851211072664,
"grad_norm": 0.4809829294681549,
"learning_rate": 0.00017159556665826195,
"loss": 0.0575,
"step": 820
},
{
"epoch": 1.4359861591695502,
"grad_norm": 0.5327945947647095,
"learning_rate": 0.0001707915880794778,
"loss": 0.0427,
"step": 830
},
{
"epoch": 1.453287197231834,
"grad_norm": 0.6132643818855286,
"learning_rate": 0.0001699783303454121,
"loss": 0.0379,
"step": 840
},
{
"epoch": 1.4705882352941178,
"grad_norm": 0.44660812616348267,
"learning_rate": 0.00016915590005552118,
"loss": 0.0395,
"step": 850
},
{
"epoch": 1.4878892733564013,
"grad_norm": 0.7750672698020935,
"learning_rate": 0.00016832440501157313,
"loss": 0.0433,
"step": 860
},
{
"epoch": 1.505190311418685,
"grad_norm": 0.3744714856147766,
"learning_rate": 0.0001674839542035178,
"loss": 0.0398,
"step": 870
},
{
"epoch": 1.5224913494809689,
"grad_norm": 0.4120178818702698,
"learning_rate": 0.0001666346577952004,
"loss": 0.0393,
"step": 880
},
{
"epoch": 1.5397923875432526,
"grad_norm": 0.5227100849151611,
"learning_rate": 0.00016577662710992174,
"loss": 0.0441,
"step": 890
},
{
"epoch": 1.5570934256055362,
"grad_norm": 0.6994874477386475,
"learning_rate": 0.00016490997461584617,
"loss": 0.0388,
"step": 900
},
{
"epoch": 1.57439446366782,
"grad_norm": 0.5372064113616943,
"learning_rate": 0.00016403481391125973,
"loss": 0.0379,
"step": 910
},
{
"epoch": 1.5916955017301038,
"grad_norm": 0.3524293601512909,
"learning_rate": 0.00016315125970967978,
"loss": 0.0477,
"step": 920
},
{
"epoch": 1.6089965397923875,
"grad_norm": 0.40851327776908875,
"learning_rate": 0.000162259427824819,
"loss": 0.0522,
"step": 930
},
{
"epoch": 1.6262975778546713,
"grad_norm": 0.3339862525463104,
"learning_rate": 0.00016135943515540455,
"loss": 0.0341,
"step": 940
},
{
"epoch": 1.643598615916955,
"grad_norm": 0.5174134373664856,
"learning_rate": 0.0001604513996698556,
"loss": 0.0402,
"step": 950
},
{
"epoch": 1.6608996539792389,
"grad_norm": 0.3898767828941345,
"learning_rate": 0.00015953544039082012,
"loss": 0.0451,
"step": 960
},
{
"epoch": 1.6782006920415224,
"grad_norm": 0.5794402956962585,
"learning_rate": 0.00015861167737957397,
"loss": 0.0327,
"step": 970
},
{
"epoch": 1.6955017301038062,
"grad_norm": 0.42949485778808594,
"learning_rate": 0.00015768023172028342,
"loss": 0.0478,
"step": 980
},
{
"epoch": 1.71280276816609,
"grad_norm": 0.48415741324424744,
"learning_rate": 0.00015674122550413396,
"loss": 0.0389,
"step": 990
},
{
"epoch": 1.7301038062283736,
"grad_norm": 0.4169400632381439,
"learning_rate": 0.00015579478181332684,
"loss": 0.0373,
"step": 1000
},
{
"epoch": 1.7474048442906573,
"grad_norm": 0.30931493639945984,
"learning_rate": 0.00015484102470494576,
"loss": 0.0498,
"step": 1010
},
{
"epoch": 1.7647058823529411,
"grad_norm": 0.4326338469982147,
"learning_rate": 0.00015388007919469603,
"loss": 0.033,
"step": 1020
},
{
"epoch": 1.782006920415225,
"grad_norm": 0.37110018730163574,
"learning_rate": 0.0001529120712405177,
"loss": 0.0391,
"step": 1030
},
{
"epoch": 1.7993079584775087,
"grad_norm": 0.33888983726501465,
"learning_rate": 0.00015193712772607537,
"loss": 0.036,
"step": 1040
},
{
"epoch": 1.8166089965397925,
"grad_norm": 0.2806644141674042,
"learning_rate": 0.0001509553764441267,
"loss": 0.0308,
"step": 1050
},
{
"epoch": 1.8339100346020762,
"grad_norm": 0.3501754105091095,
"learning_rate": 0.00014996694607977176,
"loss": 0.0415,
"step": 1060
},
{
"epoch": 1.85121107266436,
"grad_norm": 0.37800803780555725,
"learning_rate": 0.00014897196619358526,
"loss": 0.032,
"step": 1070
},
{
"epoch": 1.8685121107266436,
"grad_norm": 0.4112405478954315,
"learning_rate": 0.0001479705672046341,
"loss": 0.0362,
"step": 1080
},
{
"epoch": 1.8858131487889274,
"grad_norm": 0.42961543798446655,
"learning_rate": 0.00014696288037338256,
"loss": 0.0408,
"step": 1090
},
{
"epoch": 1.903114186851211,
"grad_norm": 0.22437059879302979,
"learning_rate": 0.00014594903778448705,
"loss": 0.0375,
"step": 1100
},
{
"epoch": 1.9204152249134947,
"grad_norm": 0.3606659770011902,
"learning_rate": 0.00014492917232948263,
"loss": 0.0309,
"step": 1110
},
{
"epoch": 1.9377162629757785,
"grad_norm": 0.45117369294166565,
"learning_rate": 0.00014390341768936413,
"loss": 0.0319,
"step": 1120
},
{
"epoch": 1.9550173010380623,
"grad_norm": 0.4739942252635956,
"learning_rate": 0.00014287190831706372,
"loss": 0.032,
"step": 1130
},
{
"epoch": 1.972318339100346,
"grad_norm": 0.3133346140384674,
"learning_rate": 0.00014183477941982704,
"loss": 0.0279,
"step": 1140
},
{
"epoch": 1.9896193771626298,
"grad_norm": 0.3473515212535858,
"learning_rate": 0.00014079216694149076,
"loss": 0.0326,
"step": 1150
},
{
"epoch": 2.0069204152249136,
"grad_norm": 0.3447270095348358,
"learning_rate": 0.00013974420754466328,
"loss": 0.0291,
"step": 1160
},
{
"epoch": 2.0242214532871974,
"grad_norm": 0.36638471484184265,
"learning_rate": 0.00013869103859281165,
"loss": 0.0336,
"step": 1170
},
{
"epoch": 2.041522491349481,
"grad_norm": 0.558734118938446,
"learning_rate": 0.0001376327981322561,
"loss": 0.0345,
"step": 1180
},
{
"epoch": 2.0588235294117645,
"grad_norm": 0.44841858744621277,
"learning_rate": 0.0001365696248740756,
"loss": 0.0377,
"step": 1190
},
{
"epoch": 2.0761245674740483,
"grad_norm": 0.6627900004386902,
"learning_rate": 0.0001355016581759257,
"loss": 0.0394,
"step": 1200
},
{
"epoch": 2.093425605536332,
"grad_norm": 0.4195360541343689,
"learning_rate": 0.00013442903802377226,
"loss": 0.0387,
"step": 1210
},
{
"epoch": 2.110726643598616,
"grad_norm": 0.4721520245075226,
"learning_rate": 0.00013335190501354227,
"loss": 0.0363,
"step": 1220
},
{
"epoch": 2.1280276816608996,
"grad_norm": 0.3159089684486389,
"learning_rate": 0.000132270400332695,
"loss": 0.03,
"step": 1230
},
{
"epoch": 2.1453287197231834,
"grad_norm": 0.43079933524131775,
"learning_rate": 0.00013118466574171564,
"loss": 0.0324,
"step": 1240
},
{
"epoch": 2.162629757785467,
"grad_norm": 0.4005158841609955,
"learning_rate": 0.00013009484355553364,
"loss": 0.0351,
"step": 1250
},
{
"epoch": 2.179930795847751,
"grad_norm": 0.5328086614608765,
"learning_rate": 0.00012900107662486857,
"loss": 0.0255,
"step": 1260
},
{
"epoch": 2.1972318339100347,
"grad_norm": 0.5238893628120422,
"learning_rate": 0.00012790350831750556,
"loss": 0.0281,
"step": 1270
},
{
"epoch": 2.2145328719723185,
"grad_norm": 0.5240843892097473,
"learning_rate": 0.0001268022824995032,
"loss": 0.0368,
"step": 1280
},
{
"epoch": 2.2318339100346023,
"grad_norm": 0.2866148054599762,
"learning_rate": 0.0001256975435163359,
"loss": 0.0295,
"step": 1290
},
{
"epoch": 2.2491349480968856,
"grad_norm": 0.46996742486953735,
"learning_rate": 0.00012458943617397344,
"loss": 0.0331,
"step": 1300
},
{
"epoch": 2.2664359861591694,
"grad_norm": 0.36567744612693787,
"learning_rate": 0.00012347810571990055,
"loss": 0.0285,
"step": 1310
},
{
"epoch": 2.283737024221453,
"grad_norm": 0.4469076693058014,
"learning_rate": 0.00012236369782407783,
"loss": 0.0256,
"step": 1320
},
{
"epoch": 2.301038062283737,
"grad_norm": 0.5179305076599121,
"learning_rate": 0.0001212463585598481,
"loss": 0.0367,
"step": 1330
},
{
"epoch": 2.3183391003460208,
"grad_norm": 0.4950549602508545,
"learning_rate": 0.00012012623438478931,
"loss": 0.0368,
"step": 1340
},
{
"epoch": 2.3356401384083045,
"grad_norm": 0.23521831631660461,
"learning_rate": 0.0001190034721215176,
"loss": 0.0341,
"step": 1350
},
{
"epoch": 2.3529411764705883,
"grad_norm": 0.44627803564071655,
"learning_rate": 0.00011787821893844189,
"loss": 0.0366,
"step": 1360
},
{
"epoch": 2.370242214532872,
"grad_norm": 0.4281276762485504,
"learning_rate": 0.00011675062233047364,
"loss": 0.0341,
"step": 1370
},
{
"epoch": 2.387543252595156,
"grad_norm": 0.5153332948684692,
"learning_rate": 0.00011562083009969366,
"loss": 0.0302,
"step": 1380
},
{
"epoch": 2.404844290657439,
"grad_norm": 0.3502303957939148,
"learning_rate": 0.00011448899033597855,
"loss": 0.0372,
"step": 1390
},
{
"epoch": 2.422145328719723,
"grad_norm": 0.20930013060569763,
"learning_rate": 0.00011335525139758962,
"loss": 0.032,
"step": 1400
},
{
"epoch": 2.4394463667820068,
"grad_norm": 0.40936240553855896,
"learning_rate": 0.00011221976189172644,
"loss": 0.0369,
"step": 1410
},
{
"epoch": 2.4567474048442905,
"grad_norm": 0.33430635929107666,
"learning_rate": 0.0001110826706550479,
"loss": 0.0329,
"step": 1420
},
{
"epoch": 2.4740484429065743,
"grad_norm": 0.5178155303001404,
"learning_rate": 0.00010994412673416303,
"loss": 0.0326,
"step": 1430
},
{
"epoch": 2.491349480968858,
"grad_norm": 0.5524935722351074,
"learning_rate": 0.00010880427936609455,
"loss": 0.0259,
"step": 1440
},
{
"epoch": 2.508650519031142,
"grad_norm": 0.32262513041496277,
"learning_rate": 0.0001076632779587172,
"loss": 0.0338,
"step": 1450
},
{
"epoch": 2.5259515570934257,
"grad_norm": 0.5853790640830994,
"learning_rate": 0.00010652127207117386,
"loss": 0.0309,
"step": 1460
},
{
"epoch": 2.5432525951557095,
"grad_norm": 0.45327532291412354,
"learning_rate": 0.00010537841139427178,
"loss": 0.0194,
"step": 1470
},
{
"epoch": 2.5605536332179932,
"grad_norm": 0.319289892911911,
"learning_rate": 0.00010423484573086138,
"loss": 0.028,
"step": 1480
},
{
"epoch": 2.577854671280277,
"grad_norm": 0.5092198848724365,
"learning_rate": 0.00010309072497620081,
"loss": 0.0267,
"step": 1490
},
{
"epoch": 2.595155709342561,
"grad_norm": 0.29407837986946106,
"learning_rate": 0.00010194619909830787,
"loss": 0.0345,
"step": 1500
},
{
"epoch": 2.612456747404844,
"grad_norm": 0.5686324238777161,
"learning_rate": 0.00010080141811830277,
"loss": 0.026,
"step": 1510
},
{
"epoch": 2.629757785467128,
"grad_norm": 0.3031514585018158,
"learning_rate": 9.965653209074378e-05,
"loss": 0.027,
"step": 1520
},
{
"epoch": 2.6470588235294117,
"grad_norm": 0.29397931694984436,
"learning_rate": 9.851169108395842e-05,
"loss": 0.0284,
"step": 1530
},
{
"epoch": 2.6643598615916955,
"grad_norm": 0.2728932201862335,
"learning_rate": 9.736704516037317e-05,
"loss": 0.0244,
"step": 1540
},
{
"epoch": 2.6816608996539792,
"grad_norm": 0.23780933022499084,
"learning_rate": 9.622274435684334e-05,
"loss": 0.0359,
"step": 1550
},
{
"epoch": 2.698961937716263,
"grad_norm": 0.3995797336101532,
"learning_rate": 9.507893866498714e-05,
"loss": 0.023,
"step": 1560
},
{
"epoch": 2.716262975778547,
"grad_norm": 0.2033807337284088,
"learning_rate": 9.393577801152486e-05,
"loss": 0.0232,
"step": 1570
},
{
"epoch": 2.7335640138408306,
"grad_norm": 0.1671728491783142,
"learning_rate": 9.279341223862705e-05,
"loss": 0.027,
"step": 1580
},
{
"epoch": 2.750865051903114,
"grad_norm": 0.3328063189983368,
"learning_rate": 9.165199108427364e-05,
"loss": 0.0308,
"step": 1590
},
{
"epoch": 2.7681660899653977,
"grad_norm": 0.5018717646598816,
"learning_rate": 9.051166416262673e-05,
"loss": 0.0266,
"step": 1600
},
{
"epoch": 2.7854671280276815,
"grad_norm": 0.44454413652420044,
"learning_rate": 8.937258094441953e-05,
"loss": 0.0264,
"step": 1610
},
{
"epoch": 2.8027681660899653,
"grad_norm": 0.29583296179771423,
"learning_rate": 8.823489073736429e-05,
"loss": 0.029,
"step": 1620
},
{
"epoch": 2.820069204152249,
"grad_norm": 0.2867840826511383,
"learning_rate": 8.70987426665814e-05,
"loss": 0.031,
"step": 1630
},
{
"epoch": 2.837370242214533,
"grad_norm": 0.17943017184734344,
"learning_rate": 8.596428565505245e-05,
"loss": 0.0235,
"step": 1640
},
{
"epoch": 2.8546712802768166,
"grad_norm": 0.3584960699081421,
"learning_rate": 8.483166840409995e-05,
"loss": 0.0255,
"step": 1650
},
{
"epoch": 2.8719723183391004,
"grad_norm": 0.2367885261774063,
"learning_rate": 8.370103937389595e-05,
"loss": 0.0297,
"step": 1660
},
{
"epoch": 2.889273356401384,
"grad_norm": 0.3012569546699524,
"learning_rate": 8.257254676400237e-05,
"loss": 0.0241,
"step": 1670
},
{
"epoch": 2.906574394463668,
"grad_norm": 0.3492811322212219,
"learning_rate": 8.144633849394527e-05,
"loss": 0.0245,
"step": 1680
},
{
"epoch": 2.9238754325259517,
"grad_norm": 0.5457948446273804,
"learning_rate": 8.032256218382618e-05,
"loss": 0.0417,
"step": 1690
},
{
"epoch": 2.9411764705882355,
"grad_norm": 0.26424387097358704,
"learning_rate": 7.920136513497232e-05,
"loss": 0.0275,
"step": 1700
},
{
"epoch": 2.9584775086505193,
"grad_norm": 0.27352163195610046,
"learning_rate": 7.808289431062892e-05,
"loss": 0.0256,
"step": 1710
},
{
"epoch": 2.9757785467128026,
"grad_norm": 0.47394421696662903,
"learning_rate": 7.696729631669564e-05,
"loss": 0.0287,
"step": 1720
},
{
"epoch": 2.9930795847750864,
"grad_norm": 0.2788919508457184,
"learning_rate": 7.585471738250984e-05,
"loss": 0.0258,
"step": 1730
},
{
"epoch": 3.01038062283737,
"grad_norm": 0.46854037046432495,
"learning_rate": 7.474530334167935e-05,
"loss": 0.0268,
"step": 1740
},
{
"epoch": 3.027681660899654,
"grad_norm": 0.3487790822982788,
"learning_rate": 7.363919961296699e-05,
"loss": 0.0207,
"step": 1750
},
{
"epoch": 3.0449826989619377,
"grad_norm": 0.3000977337360382,
"learning_rate": 7.253655118122948e-05,
"loss": 0.0229,
"step": 1760
},
{
"epoch": 3.0622837370242215,
"grad_norm": 0.2896054685115814,
"learning_rate": 7.143750257841333e-05,
"loss": 0.0224,
"step": 1770
},
{
"epoch": 3.0795847750865053,
"grad_norm": 0.24790829420089722,
"learning_rate": 7.034219786460987e-05,
"loss": 0.021,
"step": 1780
},
{
"epoch": 3.096885813148789,
"grad_norm": 0.3710263967514038,
"learning_rate": 6.925078060917245e-05,
"loss": 0.0244,
"step": 1790
},
{
"epoch": 3.114186851211073,
"grad_norm": 0.28580549359321594,
"learning_rate": 6.816339387189763e-05,
"loss": 0.0298,
"step": 1800
},
{
"epoch": 3.131487889273356,
"grad_norm": 0.2744482457637787,
"learning_rate": 6.708018018427343e-05,
"loss": 0.0199,
"step": 1810
},
{
"epoch": 3.14878892733564,
"grad_norm": 0.26155564188957214,
"learning_rate": 6.600128153079661e-05,
"loss": 0.0166,
"step": 1820
},
{
"epoch": 3.1660899653979238,
"grad_norm": 0.2101258784532547,
"learning_rate": 6.492683933036183e-05,
"loss": 0.0229,
"step": 1830
},
{
"epoch": 3.1833910034602075,
"grad_norm": 0.37706953287124634,
"learning_rate": 6.38569944177249e-05,
"loss": 0.0271,
"step": 1840
},
{
"epoch": 3.2006920415224913,
"grad_norm": 0.18298964202404022,
"learning_rate": 6.279188702504252e-05,
"loss": 0.0204,
"step": 1850
},
{
"epoch": 3.217993079584775,
"grad_norm": 0.23048189282417297,
"learning_rate": 6.173165676349103e-05,
"loss": 0.0202,
"step": 1860
},
{
"epoch": 3.235294117647059,
"grad_norm": 0.25672975182533264,
"learning_rate": 6.0676442604966654e-05,
"loss": 0.0154,
"step": 1870
},
{
"epoch": 3.2525951557093427,
"grad_norm": 0.1890515834093094,
"learning_rate": 5.9626382863869414e-05,
"loss": 0.0255,
"step": 1880
},
{
"epoch": 3.2698961937716264,
"grad_norm": 0.41655805706977844,
"learning_rate": 5.8581615178973274e-05,
"loss": 0.023,
"step": 1890
},
{
"epoch": 3.28719723183391,
"grad_norm": 0.302661269903183,
"learning_rate": 5.754227649538497e-05,
"loss": 0.0277,
"step": 1900
},
{
"epoch": 3.304498269896194,
"grad_norm": 0.5044915676116943,
"learning_rate": 5.6508503046593484e-05,
"loss": 0.0192,
"step": 1910
},
{
"epoch": 3.3217993079584773,
"grad_norm": 0.32608142495155334,
"learning_rate": 5.548043033661297e-05,
"loss": 0.016,
"step": 1920
},
{
"epoch": 3.339100346020761,
"grad_norm": 0.2057291716337204,
"learning_rate": 5.44581931222214e-05,
"loss": 0.0225,
"step": 1930
},
{
"epoch": 3.356401384083045,
"grad_norm": 0.36772650480270386,
"learning_rate": 5.3441925395297065e-05,
"loss": 0.0152,
"step": 1940
},
{
"epoch": 3.3737024221453287,
"grad_norm": 0.25624701380729675,
"learning_rate": 5.243176036525499e-05,
"loss": 0.0244,
"step": 1950
},
{
"epoch": 3.3910034602076125,
"grad_norm": 0.2545239329338074,
"learning_rate": 5.142783044158668e-05,
"loss": 0.019,
"step": 1960
},
{
"epoch": 3.4083044982698962,
"grad_norm": 0.3163670003414154,
"learning_rate": 5.043026721650388e-05,
"loss": 0.0284,
"step": 1970
},
{
"epoch": 3.42560553633218,
"grad_norm": 0.18014021217823029,
"learning_rate": 4.943920144769013e-05,
"loss": 0.0245,
"step": 1980
},
{
"epoch": 3.442906574394464,
"grad_norm": 0.3292304575443268,
"learning_rate": 4.845476304116132e-05,
"loss": 0.0198,
"step": 1990
},
{
"epoch": 3.4602076124567476,
"grad_norm": 0.30157071352005005,
"learning_rate": 4.74770810342379e-05,
"loss": 0.0215,
"step": 2000
},
{
"epoch": 3.477508650519031,
"grad_norm": 0.1187843605875969,
"learning_rate": 4.650628357863113e-05,
"loss": 0.0209,
"step": 2010
},
{
"epoch": 3.4948096885813147,
"grad_norm": 0.17467384040355682,
"learning_rate": 4.5542497923645456e-05,
"loss": 0.0176,
"step": 2020
},
{
"epoch": 3.5121107266435985,
"grad_norm": 0.3263590335845947,
"learning_rate": 4.458585039949874e-05,
"loss": 0.019,
"step": 2030
},
{
"epoch": 3.5294117647058822,
"grad_norm": 0.3304053544998169,
"learning_rate": 4.363646640076355e-05,
"loss": 0.0197,
"step": 2040
},
{
"epoch": 3.546712802768166,
"grad_norm": 0.23609571158885956,
"learning_rate": 4.2694470369930697e-05,
"loss": 0.0173,
"step": 2050
},
{
"epoch": 3.56401384083045,
"grad_norm": 0.13446663320064545,
"learning_rate": 4.175998578109756e-05,
"loss": 0.019,
"step": 2060
},
{
"epoch": 3.5813148788927336,
"grad_norm": 0.23198455572128296,
"learning_rate": 4.0833135123783683e-05,
"loss": 0.0226,
"step": 2070
},
{
"epoch": 3.5986159169550174,
"grad_norm": 0.2981939911842346,
"learning_rate": 3.991403988687499e-05,
"loss": 0.0203,
"step": 2080
},
{
"epoch": 3.615916955017301,
"grad_norm": 0.18665547668933868,
"learning_rate": 3.900282054269954e-05,
"loss": 0.0199,
"step": 2090
},
{
"epoch": 3.633217993079585,
"grad_norm": 0.25699618458747864,
"learning_rate": 3.8099596531236357e-05,
"loss": 0.0163,
"step": 2100
},
{
"epoch": 3.6505190311418687,
"grad_norm": 0.35227668285369873,
"learning_rate": 3.7204486244459334e-05,
"loss": 0.022,
"step": 2110
},
{
"epoch": 3.6678200692041525,
"grad_norm": 0.4024980664253235,
"learning_rate": 3.631760701081913e-05,
"loss": 0.0278,
"step": 2120
},
{
"epoch": 3.685121107266436,
"grad_norm": 0.37001362442970276,
"learning_rate": 3.5439075079863913e-05,
"loss": 0.0243,
"step": 2130
},
{
"epoch": 3.7024221453287196,
"grad_norm": 0.3014512360095978,
"learning_rate": 3.456900560700158e-05,
"loss": 0.0154,
"step": 2140
},
{
"epoch": 3.7197231833910034,
"grad_norm": 0.24404117465019226,
"learning_rate": 3.370751263840581e-05,
"loss": 0.0165,
"step": 2150
},
{
"epoch": 3.737024221453287,
"grad_norm": 0.16157107055187225,
"learning_rate": 3.285470909606696e-05,
"loss": 0.0214,
"step": 2160
},
{
"epoch": 3.754325259515571,
"grad_norm": 0.2038198709487915,
"learning_rate": 3.2010706762990736e-05,
"loss": 0.0146,
"step": 2170
},
{
"epoch": 3.7716262975778547,
"grad_norm": 0.21232356131076813,
"learning_rate": 3.117561626854601e-05,
"loss": 0.0127,
"step": 2180
},
{
"epoch": 3.7889273356401385,
"grad_norm": 0.32062217593193054,
"learning_rate": 3.0349547073963693e-05,
"loss": 0.0193,
"step": 2190
},
{
"epoch": 3.8062283737024223,
"grad_norm": 0.1892966628074646,
"learning_rate": 2.953260745798898e-05,
"loss": 0.0196,
"step": 2200
},
{
"epoch": 3.8235294117647056,
"grad_norm": 0.23055952787399292,
"learning_rate": 2.8724904502688566e-05,
"loss": 0.0197,
"step": 2210
},
{
"epoch": 3.8408304498269894,
"grad_norm": 0.25780200958251953,
"learning_rate": 2.792654407941444e-05,
"loss": 0.0186,
"step": 2220
},
{
"epoch": 3.858131487889273,
"grad_norm": 0.4138200283050537,
"learning_rate": 2.7137630834926788e-05,
"loss": 0.0207,
"step": 2230
},
{
"epoch": 3.875432525951557,
"grad_norm": 0.31212252378463745,
"learning_rate": 2.635826817767708e-05,
"loss": 0.0194,
"step": 2240
},
{
"epoch": 3.8927335640138407,
"grad_norm": 0.23651407659053802,
"learning_rate": 2.5588558264253547e-05,
"loss": 0.0159,
"step": 2250
},
{
"epoch": 3.9100346020761245,
"grad_norm": 0.1716080605983734,
"learning_rate": 2.4828601985990983e-05,
"loss": 0.0277,
"step": 2260
},
{
"epoch": 3.9273356401384083,
"grad_norm": 0.2992190420627594,
"learning_rate": 2.407849895574592e-05,
"loss": 0.0229,
"step": 2270
},
{
"epoch": 3.944636678200692,
"grad_norm": 0.32485488057136536,
"learning_rate": 2.3338347494839997e-05,
"loss": 0.024,
"step": 2280
},
{
"epoch": 3.961937716262976,
"grad_norm": 0.15837271511554718,
"learning_rate": 2.260824462017195e-05,
"loss": 0.0179,
"step": 2290
},
{
"epoch": 3.9792387543252596,
"grad_norm": 0.18397711217403412,
"learning_rate": 2.1888286031501216e-05,
"loss": 0.0195,
"step": 2300
},
{
"epoch": 3.9965397923875434,
"grad_norm": 0.30589598417282104,
"learning_rate": 2.1178566098903674e-05,
"loss": 0.0165,
"step": 2310
},
{
"epoch": 4.013840830449827,
"grad_norm": 0.20331960916519165,
"learning_rate": 2.047917785040202e-05,
"loss": 0.0173,
"step": 2320
},
{
"epoch": 4.031141868512111,
"grad_norm": 0.17372752726078033,
"learning_rate": 1.9790212959771815e-05,
"loss": 0.0153,
"step": 2330
},
{
"epoch": 4.048442906574395,
"grad_norm": 0.45032989978790283,
"learning_rate": 1.911176173452529e-05,
"loss": 0.0183,
"step": 2340
},
{
"epoch": 4.0657439446366785,
"grad_norm": 0.381754994392395,
"learning_rate": 1.8443913104073983e-05,
"loss": 0.0191,
"step": 2350
},
{
"epoch": 4.083044982698962,
"grad_norm": 0.2898407578468323,
"learning_rate": 1.7786754608072154e-05,
"loss": 0.0164,
"step": 2360
},
{
"epoch": 4.100346020761246,
"grad_norm": 0.32434627413749695,
"learning_rate": 1.7140372384942427e-05,
"loss": 0.0202,
"step": 2370
},
{
"epoch": 4.117647058823529,
"grad_norm": 0.19729329645633698,
"learning_rate": 1.6504851160584854e-05,
"loss": 0.0135,
"step": 2380
},
{
"epoch": 4.134948096885813,
"grad_norm": 0.18434198200702667,
"learning_rate": 1.5880274237271442e-05,
"loss": 0.0204,
"step": 2390
},
{
"epoch": 4.1522491349480966,
"grad_norm": 0.2019858956336975,
"learning_rate": 1.5266723482727075e-05,
"loss": 0.0131,
"step": 2400
},
{
"epoch": 4.16955017301038,
"grad_norm": 0.2418513149023056,
"learning_rate": 1.4664279319398566e-05,
"loss": 0.0151,
"step": 2410
},
{
"epoch": 4.186851211072664,
"grad_norm": 0.1929568648338318,
"learning_rate": 1.4073020713912987e-05,
"loss": 0.0201,
"step": 2420
},
{
"epoch": 4.204152249134948,
"grad_norm": 0.20280393958091736,
"learning_rate": 1.349302516672717e-05,
"loss": 0.0184,
"step": 2430
},
{
"epoch": 4.221453287197232,
"grad_norm": 0.11360491067171097,
"learning_rate": 1.2924368701968936e-05,
"loss": 0.0135,
"step": 2440
},
{
"epoch": 4.2387543252595155,
"grad_norm": 0.20200875401496887,
"learning_rate": 1.2367125857472283e-05,
"loss": 0.0224,
"step": 2450
},
{
"epoch": 4.256055363321799,
"grad_norm": 0.1830940693616867,
"learning_rate": 1.1821369675007076e-05,
"loss": 0.0186,
"step": 2460
},
{
"epoch": 4.273356401384083,
"grad_norm": 0.21644911170005798,
"learning_rate": 1.1287171690704923e-05,
"loss": 0.0142,
"step": 2470
},
{
"epoch": 4.290657439446367,
"grad_norm": 0.2183384746313095,
"learning_rate": 1.076460192568246e-05,
"loss": 0.0154,
"step": 2480
},
{
"epoch": 4.307958477508651,
"grad_norm": 0.2756160795688629,
"learning_rate": 1.0253728876863255e-05,
"loss": 0.0218,
"step": 2490
},
{
"epoch": 4.325259515570934,
"grad_norm": 0.23770609498023987,
"learning_rate": 9.754619507999286e-06,
"loss": 0.0217,
"step": 2500
},
{
"epoch": 4.342560553633218,
"grad_norm": 0.2103499174118042,
"learning_rate": 9.26733924089369e-06,
"loss": 0.0161,
"step": 2510
},
{
"epoch": 4.359861591695502,
"grad_norm": 0.13652902841567993,
"learning_rate": 8.791951946825305e-06,
"loss": 0.0136,
"step": 2520
},
{
"epoch": 4.377162629757786,
"grad_norm": 0.15833500027656555,
"learning_rate": 8.328519938176737e-06,
"loss": 0.0137,
"step": 2530
},
{
"epoch": 4.3944636678200695,
"grad_norm": 0.251105934381485,
"learning_rate": 7.877103960266574e-06,
"loss": 0.0207,
"step": 2540
},
{
"epoch": 4.411764705882353,
"grad_norm": 0.2944389581680298,
"learning_rate": 7.437763183387048e-06,
"loss": 0.0164,
"step": 2550
},
{
"epoch": 4.429065743944637,
"grad_norm": 0.40459296107292175,
"learning_rate": 7.010555195048241e-06,
"loss": 0.0223,
"step": 2560
},
{
"epoch": 4.446366782006921,
"grad_norm": 0.211450457572937,
"learning_rate": 6.59553599242958e-06,
"loss": 0.0169,
"step": 2570
},
{
"epoch": 4.463667820069205,
"grad_norm": 0.22154732048511505,
"learning_rate": 6.1927599750399634e-06,
"loss": 0.0185,
"step": 2580
},
{
"epoch": 4.4809688581314875,
"grad_norm": 0.15479212999343872,
"learning_rate": 5.802279937587218e-06,
"loss": 0.0121,
"step": 2590
},
{
"epoch": 4.498269896193771,
"grad_norm": 0.16314175724983215,
"learning_rate": 5.424147063057938e-06,
"loss": 0.0208,
"step": 2600
},
{
"epoch": 4.515570934256055,
"grad_norm": 0.20242264866828918,
"learning_rate": 5.058410916008494e-06,
"loss": 0.0152,
"step": 2610
},
{
"epoch": 4.532871972318339,
"grad_norm": 0.2115008533000946,
"learning_rate": 4.70511943606835e-06,
"loss": 0.0156,
"step": 2620
},
{
"epoch": 4.550173010380623,
"grad_norm": 0.10006527602672577,
"learning_rate": 4.364318931656186e-06,
"loss": 0.0168,
"step": 2630
},
{
"epoch": 4.567474048442906,
"grad_norm": 0.2213827222585678,
"learning_rate": 4.0360540739100335e-06,
"loss": 0.0191,
"step": 2640
},
{
"epoch": 4.58477508650519,
"grad_norm": 0.27389973402023315,
"learning_rate": 3.7203678908318327e-06,
"loss": 0.0188,
"step": 2650
},
{
"epoch": 4.602076124567474,
"grad_norm": 0.18938276171684265,
"learning_rate": 3.417301761647429e-06,
"loss": 0.0118,
"step": 2660
},
{
"epoch": 4.619377162629758,
"grad_norm": 0.36289745569229126,
"learning_rate": 3.1268954113827798e-06,
"loss": 0.0224,
"step": 2670
},
{
"epoch": 4.6366782006920415,
"grad_norm": 0.32742252945899963,
"learning_rate": 2.8491869056568643e-06,
"loss": 0.0163,
"step": 2680
},
{
"epoch": 4.653979238754325,
"grad_norm": 0.15712793171405792,
"learning_rate": 2.5842126456921633e-06,
"loss": 0.0141,
"step": 2690
},
{
"epoch": 4.671280276816609,
"grad_norm": 0.18424645066261292,
"learning_rate": 2.3320073635432984e-06,
"loss": 0.0152,
"step": 2700
},
{
"epoch": 4.688581314878893,
"grad_norm": 0.2283022552728653,
"learning_rate": 2.092604117544461e-06,
"loss": 0.0125,
"step": 2710
},
{
"epoch": 4.705882352941177,
"grad_norm": 0.19734638929367065,
"learning_rate": 1.8660342879761817e-06,
"loss": 0.0165,
"step": 2720
},
{
"epoch": 4.72318339100346,
"grad_norm": 0.2148875743150711,
"learning_rate": 1.6523275729521615e-06,
"loss": 0.0105,
"step": 2730
},
{
"epoch": 4.740484429065744,
"grad_norm": 0.20573090016841888,
"learning_rate": 1.4515119845264658e-06,
"loss": 0.011,
"step": 2740
},
{
"epoch": 4.757785467128028,
"grad_norm": 0.24224159121513367,
"learning_rate": 1.2636138450218382e-06,
"loss": 0.0172,
"step": 2750
},
{
"epoch": 4.775086505190312,
"grad_norm": 0.10488854348659515,
"learning_rate": 1.0886577835793831e-06,
"loss": 0.014,
"step": 2760
},
{
"epoch": 4.7923875432525955,
"grad_norm": 0.16592086851596832,
"learning_rate": 9.26666732930348e-07,
"loss": 0.0114,
"step": 2770
},
{
"epoch": 4.809688581314878,
"grad_norm": 0.12251131981611252,
"learning_rate": 7.776619263900387e-07,
"loss": 0.0135,
"step": 2780
},
{
"epoch": 4.826989619377162,
"grad_norm": 0.16953104734420776,
"learning_rate": 6.416628950747461e-07,
"loss": 0.0156,
"step": 2790
},
{
"epoch": 4.844290657439446,
"grad_norm": 0.3001477122306824,
"learning_rate": 5.186874653415718e-07,
"loss": 0.0165,
"step": 2800
},
{
"epoch": 4.86159169550173,
"grad_norm": 0.14295035600662231,
"learning_rate": 4.087517564518528e-07,
"loss": 0.0117,
"step": 2810
},
{
"epoch": 4.8788927335640135,
"grad_norm": 0.14125655591487885,
"learning_rate": 3.1187017845827337e-07,
"loss": 0.0137,
"step": 2820
},
{
"epoch": 4.896193771626297,
"grad_norm": 0.20176155865192413,
"learning_rate": 2.2805543031604314e-07,
"loss": 0.0217,
"step": 2830
},
{
"epoch": 4.913494809688581,
"grad_norm": 0.24088892340660095,
"learning_rate": 1.5731849821833954e-07,
"loss": 0.0159,
"step": 2840
},
{
"epoch": 4.930795847750865,
"grad_norm": 0.27673816680908203,
"learning_rate": 9.966865415631521e-08,
"loss": 0.0164,
"step": 2850
},
{
"epoch": 4.948096885813149,
"grad_norm": 0.18399952352046967,
"learning_rate": 5.5113454703692445e-08,
"loss": 0.013,
"step": 2860
},
{
"epoch": 4.965397923875432,
"grad_norm": 0.24507218599319458,
"learning_rate": 2.3658740026311077e-08,
"loss": 0.0202,
"step": 2870
},
{
"epoch": 4.982698961937716,
"grad_norm": 0.3336757719516754,
"learning_rate": 5.3086331166074535e-09,
"loss": 0.0137,
"step": 2880
},
{
"epoch": 4.998269896193771,
"step": 2889,
"total_flos": 1.012715054916768e+17,
"train_loss": 0.04389128585140695,
"train_runtime": 1270.1086,
"train_samples_per_second": 36.394,
"train_steps_per_second": 2.275
}
],
"logging_steps": 10,
"max_steps": 2889,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.012715054916768e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}