{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.993485342019544,
"eval_steps": 500,
"global_step": 1533,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.03257328990228013,
"grad_norm": 10.128602027893066,
"learning_rate": 2.5974025974025972e-05,
"loss": 1.7762,
"step": 10
},
{
"epoch": 0.06514657980456026,
"grad_norm": 11.557284355163574,
"learning_rate": 5.1948051948051944e-05,
"loss": 0.5874,
"step": 20
},
{
"epoch": 0.09771986970684039,
"grad_norm": 1.7021989822387695,
"learning_rate": 7.792207792207793e-05,
"loss": 0.2361,
"step": 30
},
{
"epoch": 0.13029315960912052,
"grad_norm": 2.088670015335083,
"learning_rate": 0.00010389610389610389,
"loss": 0.176,
"step": 40
},
{
"epoch": 0.16286644951140064,
"grad_norm": 2.070906400680542,
"learning_rate": 0.00012987012987012987,
"loss": 0.1546,
"step": 50
},
{
"epoch": 0.19543973941368079,
"grad_norm": 1.9704210758209229,
"learning_rate": 0.00015584415584415587,
"loss": 0.1463,
"step": 60
},
{
"epoch": 0.2280130293159609,
"grad_norm": 1.2319754362106323,
"learning_rate": 0.00018181818181818183,
"loss": 0.1306,
"step": 70
},
{
"epoch": 0.26058631921824105,
"grad_norm": 1.4384595155715942,
"learning_rate": 0.0001999979049808708,
"loss": 0.1218,
"step": 80
},
{
"epoch": 0.2931596091205212,
"grad_norm": 1.3861066102981567,
"learning_rate": 0.00019996066263830531,
"loss": 0.1031,
"step": 90
},
{
"epoch": 0.3257328990228013,
"grad_norm": 1.3069312572479248,
"learning_rate": 0.00019987688427197897,
"loss": 0.1125,
"step": 100
},
{
"epoch": 0.3583061889250814,
"grad_norm": 1.1740531921386719,
"learning_rate": 0.0001997466088843548,
"loss": 0.0984,
"step": 110
},
{
"epoch": 0.39087947882736157,
"grad_norm": 0.831253170967102,
"learning_rate": 0.00019956989712427577,
"loss": 0.1013,
"step": 120
},
{
"epoch": 0.4234527687296417,
"grad_norm": 1.1512802839279175,
"learning_rate": 0.0001993468312587303,
"loss": 0.0827,
"step": 130
},
{
"epoch": 0.4560260586319218,
"grad_norm": 0.7890095710754395,
"learning_rate": 0.00019907751513455302,
"loss": 0.0747,
"step": 140
},
{
"epoch": 0.48859934853420195,
"grad_norm": 0.8635206818580627,
"learning_rate": 0.00019876207413008015,
"loss": 0.0911,
"step": 150
},
{
"epoch": 0.5211726384364821,
"grad_norm": 0.984118640422821,
"learning_rate": 0.00019840065509677988,
"loss": 0.0708,
"step": 160
},
{
"epoch": 0.5537459283387622,
"grad_norm": 1.1899775266647339,
"learning_rate": 0.00019799342629088702,
"loss": 0.0759,
"step": 170
},
{
"epoch": 0.5863192182410424,
"grad_norm": 0.8736202120780945,
"learning_rate": 0.00019754057729507227,
"loss": 0.0747,
"step": 180
},
{
"epoch": 0.6188925081433225,
"grad_norm": 0.6165608763694763,
"learning_rate": 0.0001970423189301833,
"loss": 0.0697,
"step": 190
},
{
"epoch": 0.6514657980456026,
"grad_norm": 1.2145845890045166,
"learning_rate": 0.00019649888315709843,
"loss": 0.0768,
"step": 200
},
{
"epoch": 0.6840390879478827,
"grad_norm": 0.7146447896957397,
"learning_rate": 0.00019591052296873888,
"loss": 0.0575,
"step": 210
},
{
"epoch": 0.7166123778501629,
"grad_norm": 1.0241068601608276,
"learning_rate": 0.00019527751227228963,
"loss": 0.0688,
"step": 220
},
{
"epoch": 0.749185667752443,
"grad_norm": 0.8579204678535461,
"learning_rate": 0.00019460014576168358,
"loss": 0.0544,
"step": 230
},
{
"epoch": 0.7817589576547231,
"grad_norm": 0.5284867286682129,
"learning_rate": 0.0001938787387804088,
"loss": 0.0516,
"step": 240
},
{
"epoch": 0.8143322475570033,
"grad_norm": 0.6213068962097168,
"learning_rate": 0.00019311362717470268,
"loss": 0.0501,
"step": 250
},
{
"epoch": 0.8469055374592834,
"grad_norm": 0.7320097088813782,
"learning_rate": 0.00019230516713720052,
"loss": 0.0581,
"step": 260
},
{
"epoch": 0.8794788273615635,
"grad_norm": 0.36288219690322876,
"learning_rate": 0.00019145373504111279,
"loss": 0.0478,
"step": 270
},
{
"epoch": 0.9120521172638436,
"grad_norm": 0.6096987724304199,
"learning_rate": 0.00019055972726500695,
"loss": 0.0548,
"step": 280
},
{
"epoch": 0.9446254071661238,
"grad_norm": 0.7391396760940552,
"learning_rate": 0.0001896235600082759,
"loss": 0.0444,
"step": 290
},
{
"epoch": 0.9771986970684039,
"grad_norm": 0.7820376753807068,
"learning_rate": 0.00018864566909737937,
"loss": 0.0566,
"step": 300
},
{
"epoch": 1.009771986970684,
"grad_norm": 0.3828499913215637,
"learning_rate": 0.00018762650978294758,
"loss": 0.0533,
"step": 310
},
{
"epoch": 1.0423452768729642,
"grad_norm": 0.49720415472984314,
"learning_rate": 0.0001865665565278424,
"loss": 0.0582,
"step": 320
},
{
"epoch": 1.0749185667752443,
"grad_norm": 0.4968441128730774,
"learning_rate": 0.00018546630278627437,
"loss": 0.0473,
"step": 330
},
{
"epoch": 1.1074918566775245,
"grad_norm": 0.812972366809845,
"learning_rate": 0.00018432626077407829,
"loss": 0.0382,
"step": 340
},
{
"epoch": 1.1400651465798046,
"grad_norm": 0.4055006802082062,
"learning_rate": 0.00018314696123025454,
"loss": 0.043,
"step": 350
},
{
"epoch": 1.1726384364820848,
"grad_norm": 0.6709649562835693,
"learning_rate": 0.0001819289531698871,
"loss": 0.0343,
"step": 360
},
{
"epoch": 1.205211726384365,
"grad_norm": 0.6343613862991333,
"learning_rate": 0.0001806728036285532,
"loss": 0.0388,
"step": 370
},
{
"epoch": 1.237785016286645,
"grad_norm": 0.44080787897109985,
"learning_rate": 0.00017937909739834367,
"loss": 0.0419,
"step": 380
},
{
"epoch": 1.2703583061889252,
"grad_norm": 0.714772641658783,
"learning_rate": 0.00017804843675561677,
"loss": 0.0476,
"step": 390
},
{
"epoch": 1.3029315960912053,
"grad_norm": 0.4127221703529358,
"learning_rate": 0.00017668144118061262,
"loss": 0.0354,
"step": 400
},
{
"epoch": 1.3355048859934853,
"grad_norm": 0.44271162152290344,
"learning_rate": 0.00017527874706905805,
"loss": 0.0385,
"step": 410
},
{
"epoch": 1.3680781758957654,
"grad_norm": 0.6225533485412598,
"learning_rate": 0.00017384100743589697,
"loss": 0.0665,
"step": 420
},
{
"epoch": 1.4006514657980456,
"grad_norm": 0.4418353736400604,
"learning_rate": 0.0001723688916112835,
"loss": 0.0554,
"step": 430
},
{
"epoch": 1.4332247557003257,
"grad_norm": 0.3992595672607422,
"learning_rate": 0.00017086308492897983,
"loss": 0.0522,
"step": 440
},
{
"epoch": 1.4657980456026058,
"grad_norm": 0.45478880405426025,
"learning_rate": 0.0001693242884073035,
"loss": 0.0398,
"step": 450
},
{
"epoch": 1.498371335504886,
"grad_norm": 0.5688516497612,
"learning_rate": 0.00016775321842277312,
"loss": 0.032,
"step": 460
},
{
"epoch": 1.5309446254071661,
"grad_norm": 0.4958943724632263,
"learning_rate": 0.00016615060637660388,
"loss": 0.0422,
"step": 470
},
{
"epoch": 1.5635179153094463,
"grad_norm": 0.4975818991661072,
"learning_rate": 0.00016451719835420877,
"loss": 0.0317,
"step": 480
},
{
"epoch": 1.5960912052117264,
"grad_norm": 0.5288175344467163,
"learning_rate": 0.00016285375477786322,
"loss": 0.0412,
"step": 490
},
{
"epoch": 1.6286644951140063,
"grad_norm": 0.6416569352149963,
"learning_rate": 0.0001611610500526957,
"loss": 0.0275,
"step": 500
},
{
"epoch": 1.6612377850162865,
"grad_norm": 0.42349156737327576,
"learning_rate": 0.00015943987220616855,
"loss": 0.029,
"step": 510
},
{
"epoch": 1.6938110749185666,
"grad_norm": 0.2572161853313446,
"learning_rate": 0.00015769102252121702,
"loss": 0.0308,
"step": 520
},
{
"epoch": 1.7263843648208468,
"grad_norm": 0.5944684147834778,
"learning_rate": 0.0001559153151632171,
"loss": 0.0354,
"step": 530
},
{
"epoch": 1.758957654723127,
"grad_norm": 0.5154451727867126,
"learning_rate": 0.0001541135768009566,
"loss": 0.0424,
"step": 540
},
{
"epoch": 1.791530944625407,
"grad_norm": 0.5298171043395996,
"learning_rate": 0.00015228664622178467,
"loss": 0.0263,
"step": 550
},
{
"epoch": 1.8241042345276872,
"grad_norm": 0.3582160472869873,
"learning_rate": 0.00015043537394112007,
"loss": 0.0386,
"step": 560
},
{
"epoch": 1.8566775244299674,
"grad_norm": 0.6350813508033752,
"learning_rate": 0.0001485606218064993,
"loss": 0.0444,
"step": 570
},
{
"epoch": 1.8892508143322475,
"grad_norm": 0.6265957355499268,
"learning_rate": 0.00014666326259634918,
"loss": 0.0417,
"step": 580
},
{
"epoch": 1.9218241042345277,
"grad_norm": 0.41339483857154846,
"learning_rate": 0.00014474417961367065,
"loss": 0.0309,
"step": 590
},
{
"epoch": 1.9543973941368078,
"grad_norm": 0.41287094354629517,
"learning_rate": 0.000142804266274823,
"loss": 0.0308,
"step": 600
},
{
"epoch": 1.986970684039088,
"grad_norm": 0.5833337903022766,
"learning_rate": 0.00014084442569359964,
"loss": 0.0346,
"step": 610
},
{
"epoch": 2.019543973941368,
"grad_norm": 0.3714321255683899,
"learning_rate": 0.00013886557026078955,
"loss": 0.0337,
"step": 620
},
{
"epoch": 2.0521172638436482,
"grad_norm": 0.2712138295173645,
"learning_rate": 0.0001368686212194199,
"loss": 0.0196,
"step": 630
},
{
"epoch": 2.0846905537459284,
"grad_norm": 0.6208952069282532,
"learning_rate": 0.00013485450823587725,
"loss": 0.0288,
"step": 640
},
{
"epoch": 2.1172638436482085,
"grad_norm": 0.34229573607444763,
"learning_rate": 0.00013282416896710778,
"loss": 0.0246,
"step": 650
},
{
"epoch": 2.1498371335504887,
"grad_norm": 0.4573960304260254,
"learning_rate": 0.00013077854862409696,
"loss": 0.0249,
"step": 660
},
{
"epoch": 2.182410423452769,
"grad_norm": 0.3675532042980194,
"learning_rate": 0.0001287185995318333,
"loss": 0.022,
"step": 670
},
{
"epoch": 2.214983713355049,
"grad_norm": 0.5372172594070435,
"learning_rate": 0.0001266452806859594,
"loss": 0.0343,
"step": 680
},
{
"epoch": 2.247557003257329,
"grad_norm": 0.46904900670051575,
"learning_rate": 0.00012455955730631804,
"loss": 0.0233,
"step": 690
},
{
"epoch": 2.2801302931596092,
"grad_norm": 0.4394093155860901,
"learning_rate": 0.00012246240038760043,
"loss": 0.0209,
"step": 700
},
{
"epoch": 2.3127035830618894,
"grad_norm": 0.3127492368221283,
"learning_rate": 0.00012035478624730608,
"loss": 0.0321,
"step": 710
},
{
"epoch": 2.3452768729641695,
"grad_norm": 0.601370096206665,
"learning_rate": 0.00011823769607122479,
"loss": 0.0243,
"step": 720
},
{
"epoch": 2.3778501628664497,
"grad_norm": 0.5871070623397827,
"learning_rate": 0.00011611211545665184,
"loss": 0.0337,
"step": 730
},
{
"epoch": 2.41042345276873,
"grad_norm": 0.3546801805496216,
"learning_rate": 0.00011397903395354996,
"loss": 0.0288,
"step": 740
},
{
"epoch": 2.44299674267101,
"grad_norm": 0.8319407105445862,
"learning_rate": 0.0001118394446038708,
"loss": 0.0337,
"step": 750
},
{
"epoch": 2.47557003257329,
"grad_norm": 0.5210663080215454,
"learning_rate": 0.00010969434347925076,
"loss": 0.026,
"step": 760
},
{
"epoch": 2.5081433224755703,
"grad_norm": 0.5834184288978577,
"learning_rate": 0.00010754472921729661,
"loss": 0.0282,
"step": 770
},
{
"epoch": 2.5407166123778504,
"grad_norm": 0.42890864610671997,
"learning_rate": 0.00010539160255667623,
"loss": 0.028,
"step": 780
},
{
"epoch": 2.5732899022801305,
"grad_norm": 0.4473400413990021,
"learning_rate": 0.00010323596587123145,
"loss": 0.025,
"step": 790
},
{
"epoch": 2.6058631921824107,
"grad_norm": 0.5189303159713745,
"learning_rate": 0.00010107882270332952,
"loss": 0.0293,
"step": 800
},
{
"epoch": 2.6384364820846904,
"grad_norm": 0.43365001678466797,
"learning_rate": 9.892117729667052e-05,
"loss": 0.0175,
"step": 810
},
{
"epoch": 2.6710097719869705,
"grad_norm": 0.28346696496009827,
"learning_rate": 9.676403412876856e-05,
"loss": 0.0334,
"step": 820
},
{
"epoch": 2.7035830618892507,
"grad_norm": 0.3956477642059326,
"learning_rate": 9.460839744332378e-05,
"loss": 0.0271,
"step": 830
},
{
"epoch": 2.736156351791531,
"grad_norm": 0.30705949664115906,
"learning_rate": 9.245527078270341e-05,
"loss": 0.0217,
"step": 840
},
{
"epoch": 2.768729641693811,
"grad_norm": 0.40188854932785034,
"learning_rate": 9.030565652074926e-05,
"loss": 0.019,
"step": 850
},
{
"epoch": 2.801302931596091,
"grad_norm": 0.3447129428386688,
"learning_rate": 8.816055539612924e-05,
"loss": 0.028,
"step": 860
},
{
"epoch": 2.8338762214983713,
"grad_norm": 0.38768622279167175,
"learning_rate": 8.602096604645009e-05,
"loss": 0.0218,
"step": 870
},
{
"epoch": 2.8664495114006514,
"grad_norm": 0.26912721991539,
"learning_rate": 8.388788454334817e-05,
"loss": 0.0173,
"step": 880
},
{
"epoch": 2.8990228013029316,
"grad_norm": 0.33078861236572266,
"learning_rate": 8.176230392877523e-05,
"loss": 0.0233,
"step": 890
},
{
"epoch": 2.9315960912052117,
"grad_norm": 0.24832488596439362,
"learning_rate": 7.964521375269396e-05,
"loss": 0.0171,
"step": 900
},
{
"epoch": 2.964169381107492,
"grad_norm": 0.6595136523246765,
"learning_rate": 7.753759961239964e-05,
"loss": 0.0272,
"step": 910
},
{
"epoch": 2.996742671009772,
"grad_norm": 0.2780207097530365,
"learning_rate": 7.544044269368197e-05,
"loss": 0.0338,
"step": 920
},
{
"epoch": 3.029315960912052,
"grad_norm": 0.7173179388046265,
"learning_rate": 7.335471931404063e-05,
"loss": 0.0365,
"step": 930
},
{
"epoch": 3.0618892508143323,
"grad_norm": 0.33753442764282227,
"learning_rate": 7.128140046816671e-05,
"loss": 0.0195,
"step": 940
},
{
"epoch": 3.0944625407166124,
"grad_norm": 0.35064950585365295,
"learning_rate": 6.922145137590306e-05,
"loss": 0.02,
"step": 950
},
{
"epoch": 3.1270358306188926,
"grad_norm": 0.39598166942596436,
"learning_rate": 6.717583103289229e-05,
"loss": 0.0203,
"step": 960
},
{
"epoch": 3.1596091205211727,
"grad_norm": 0.18257524073123932,
"learning_rate": 6.514549176412275e-05,
"loss": 0.0134,
"step": 970
},
{
"epoch": 3.192182410423453,
"grad_norm": 0.4458347260951996,
"learning_rate": 6.313137878058013e-05,
"loss": 0.0236,
"step": 980
},
{
"epoch": 3.224755700325733,
"grad_norm": 0.22742605209350586,
"learning_rate": 6.113442973921046e-05,
"loss": 0.0208,
"step": 990
},
{
"epoch": 3.257328990228013,
"grad_norm": 0.1858537793159485,
"learning_rate": 5.9155574306400395e-05,
"loss": 0.0218,
"step": 1000
},
{
"epoch": 3.2899022801302933,
"grad_norm": 0.24626286327838898,
"learning_rate": 5.7195733725176994e-05,
"loss": 0.0232,
"step": 1010
},
{
"epoch": 3.3224755700325734,
"grad_norm": 0.2719153165817261,
"learning_rate": 5.525582038632934e-05,
"loss": 0.0148,
"step": 1020
},
{
"epoch": 3.3550488599348536,
"grad_norm": 0.218730166554451,
"learning_rate": 5.333673740365083e-05,
"loss": 0.0157,
"step": 1030
},
{
"epoch": 3.3876221498371337,
"grad_norm": 0.20292945206165314,
"learning_rate": 5.1439378193500707e-05,
"loss": 0.0143,
"step": 1040
},
{
"epoch": 3.420195439739414,
"grad_norm": 0.2846449017524719,
"learning_rate": 4.956462605887994e-05,
"loss": 0.0177,
"step": 1050
},
{
"epoch": 3.4527687296416936,
"grad_norm": 0.322721391916275,
"learning_rate": 4.771335377821535e-05,
"loss": 0.0224,
"step": 1060
},
{
"epoch": 3.4853420195439737,
"grad_norm": 0.1719449758529663,
"learning_rate": 4.588642319904343e-05,
"loss": 0.0234,
"step": 1070
},
{
"epoch": 3.517915309446254,
"grad_norm": 0.44704851508140564,
"learning_rate": 4.408468483678293e-05,
"loss": 0.019,
"step": 1080
},
{
"epoch": 3.550488599348534,
"grad_norm": 0.4159814417362213,
"learning_rate": 4.230897747878303e-05,
"loss": 0.0156,
"step": 1090
},
{
"epoch": 3.583061889250814,
"grad_norm": 0.19604472815990448,
"learning_rate": 4.056012779383145e-05,
"loss": 0.0158,
"step": 1100
},
{
"epoch": 3.6156351791530943,
"grad_norm": 0.19116809964179993,
"learning_rate": 3.883894994730428e-05,
"loss": 0.0174,
"step": 1110
},
{
"epoch": 3.6482084690553744,
"grad_norm": 0.3637801706790924,
"learning_rate": 3.714624522213681e-05,
"loss": 0.0162,
"step": 1120
},
{
"epoch": 3.6807817589576546,
"grad_norm": 0.1877295821905136,
"learning_rate": 3.548280164579126e-05,
"loss": 0.0142,
"step": 1130
},
{
"epoch": 3.7133550488599347,
"grad_norm": 0.1830226182937622,
"learning_rate": 3.384939362339614e-05,
"loss": 0.0119,
"step": 1140
},
{
"epoch": 3.745928338762215,
"grad_norm": 0.15163740515708923,
"learning_rate": 3.224678157722689e-05,
"loss": 0.0181,
"step": 1150
},
{
"epoch": 3.778501628664495,
"grad_norm": 0.2479788213968277,
"learning_rate": 3.067571159269651e-05,
"loss": 0.0138,
"step": 1160
},
{
"epoch": 3.811074918566775,
"grad_norm": 0.6171669960021973,
"learning_rate": 2.913691507102019e-05,
"loss": 0.0197,
"step": 1170
},
{
"epoch": 3.8436482084690553,
"grad_norm": 0.18519634008407593,
"learning_rate": 2.763110838871651e-05,
"loss": 0.0137,
"step": 1180
},
{
"epoch": 3.8762214983713354,
"grad_norm": 0.26303982734680176,
"learning_rate": 2.6158992564103058e-05,
"loss": 0.0172,
"step": 1190
},
{
"epoch": 3.9087947882736156,
"grad_norm": 0.28331807255744934,
"learning_rate": 2.4721252930941974e-05,
"loss": 0.0168,
"step": 1200
},
{
"epoch": 3.9413680781758957,
"grad_norm": 0.20530906319618225,
"learning_rate": 2.3318558819387404e-05,
"loss": 0.0199,
"step": 1210
},
{
"epoch": 3.973941368078176,
"grad_norm": 0.16924133896827698,
"learning_rate": 2.1951563244383233e-05,
"loss": 0.0146,
"step": 1220
},
{
"epoch": 4.006514657980456,
"grad_norm": 0.13186028599739075,
"learning_rate": 2.0620902601656345e-05,
"loss": 0.0124,
"step": 1230
},
{
"epoch": 4.039087947882736,
"grad_norm": 0.24360792338848114,
"learning_rate": 1.9327196371446776e-05,
"loss": 0.0119,
"step": 1240
},
{
"epoch": 4.071661237785016,
"grad_norm": 0.09876150637865067,
"learning_rate": 1.807104683011289e-05,
"loss": 0.012,
"step": 1250
},
{
"epoch": 4.1042345276872965,
"grad_norm": 0.2283184826374054,
"learning_rate": 1.6853038769745467e-05,
"loss": 0.0142,
"step": 1260
},
{
"epoch": 4.136807817589577,
"grad_norm": 0.32383596897125244,
"learning_rate": 1.5673739225921758e-05,
"loss": 0.012,
"step": 1270
},
{
"epoch": 4.169381107491857,
"grad_norm": 0.2783248722553253,
"learning_rate": 1.4533697213725662e-05,
"loss": 0.0163,
"step": 1280
},
{
"epoch": 4.201954397394137,
"grad_norm": 0.17678265273571014,
"learning_rate": 1.3433443472157613e-05,
"loss": 0.012,
"step": 1290
},
{
"epoch": 4.234527687296417,
"grad_norm": 0.25102487206459045,
"learning_rate": 1.237349021705243e-05,
"loss": 0.0158,
"step": 1300
},
{
"epoch": 4.267100977198697,
"grad_norm": 0.15461167693138123,
"learning_rate": 1.1354330902620636e-05,
"loss": 0.0126,
"step": 1310
},
{
"epoch": 4.299674267100977,
"grad_norm": 0.24122057855129242,
"learning_rate": 1.0376439991724096e-05,
"loss": 0.0168,
"step": 1320
},
{
"epoch": 4.3322475570032575,
"grad_norm": 0.14669205248355865,
"learning_rate": 9.440272734993072e-06,
"loss": 0.0179,
"step": 1330
},
{
"epoch": 4.364820846905538,
"grad_norm": 0.32440969347953796,
"learning_rate": 8.546264958887219e-06,
"loss": 0.0197,
"step": 1340
},
{
"epoch": 4.397394136807818,
"grad_norm": 0.14456795156002045,
"learning_rate": 7.694832862799505e-06,
"loss": 0.0111,
"step": 1350
},
{
"epoch": 4.429967426710098,
"grad_norm": 0.17956456542015076,
"learning_rate": 6.886372825297349e-06,
"loss": 0.0085,
"step": 1360
},
{
"epoch": 4.462540716612378,
"grad_norm": 0.30424752831459045,
"learning_rate": 6.12126121959119e-06,
"loss": 0.0207,
"step": 1370
},
{
"epoch": 4.495114006514658,
"grad_norm": 0.18671758472919464,
"learning_rate": 5.399854238316437e-06,
"loss": 0.013,
"step": 1380
},
{
"epoch": 4.527687296416938,
"grad_norm": 0.3565406799316406,
"learning_rate": 4.722487727710368e-06,
"loss": 0.0165,
"step": 1390
},
{
"epoch": 4.5602605863192185,
"grad_norm": 0.26344749331474304,
"learning_rate": 4.089477031261113e-06,
"loss": 0.0148,
"step": 1400
},
{
"epoch": 4.592833876221499,
"grad_norm": 0.18339155614376068,
"learning_rate": 3.5011168429016083e-06,
"loss": 0.0195,
"step": 1410
},
{
"epoch": 4.625407166123779,
"grad_norm": 0.2598022222518921,
"learning_rate": 2.95768106981672e-06,
"loss": 0.0135,
"step": 1420
},
{
"epoch": 4.657980456026059,
"grad_norm": 0.3853515684604645,
"learning_rate": 2.4594227049277386e-06,
"loss": 0.0177,
"step": 1430
},
{
"epoch": 4.690553745928339,
"grad_norm": 0.13664180040359497,
"learning_rate": 2.006573709112991e-06,
"loss": 0.0086,
"step": 1440
},
{
"epoch": 4.723127035830619,
"grad_norm": 0.1015399917960167,
"learning_rate": 1.5993449032201458e-06,
"loss": 0.0116,
"step": 1450
},
{
"epoch": 4.755700325732899,
"grad_norm": 0.18885648250579834,
"learning_rate": 1.237925869919887e-06,
"loss": 0.0175,
"step": 1460
},
{
"epoch": 4.7882736156351795,
"grad_norm": 0.18131224811077118,
"learning_rate": 9.224848654469931e-07,
"loss": 0.0088,
"step": 1470
},
{
"epoch": 4.82084690553746,
"grad_norm": 0.194551482796669,
"learning_rate": 6.531687412697496e-07,
"loss": 0.014,
"step": 1480
},
{
"epoch": 4.85342019543974,
"grad_norm": 0.23798178136348724,
"learning_rate": 4.3010287572422537e-07,
"loss": 0.0097,
"step": 1490
},
{
"epoch": 4.88599348534202,
"grad_norm": 0.141094371676445,
"learning_rate": 2.5339111564521844e-07,
"loss": 0.0151,
"step": 1500
},
{
"epoch": 4.918566775244299,
"grad_norm": 0.17839759588241577,
"learning_rate": 1.2311572802105043e-07,
"loss": 0.0097,
"step": 1510
},
{
"epoch": 4.95114006514658,
"grad_norm": 0.3124100863933563,
"learning_rate": 3.933736169471347e-08,
"loss": 0.0098,
"step": 1520
},
{
"epoch": 4.9837133550488595,
"grad_norm": 0.17664480209350586,
"learning_rate": 2.0950191292112842e-09,
"loss": 0.0142,
"step": 1530
},
{
"epoch": 4.993485342019544,
"step": 1533,
"total_flos": 5.185032946443418e+16,
"train_loss": 0.05182663513868756,
"train_runtime": 722.6992,
"train_samples_per_second": 33.939,
"train_steps_per_second": 2.121
}
],
"logging_steps": 10,
"max_steps": 1533,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.185032946443418e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}