{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.993485342019544,
  "eval_steps": 500,
  "global_step": 1533,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03257328990228013,
      "grad_norm": 10.128602027893066,
      "learning_rate": 2.5974025974025972e-05,
      "loss": 1.7762,
      "step": 10
    },
    {
      "epoch": 0.06514657980456026,
      "grad_norm": 11.557284355163574,
      "learning_rate": 5.1948051948051944e-05,
      "loss": 0.5874,
      "step": 20
    },
    {
      "epoch": 0.09771986970684039,
      "grad_norm": 1.7021989822387695,
      "learning_rate": 7.792207792207793e-05,
      "loss": 0.2361,
      "step": 30
    },
    {
      "epoch": 0.13029315960912052,
      "grad_norm": 2.088670015335083,
      "learning_rate": 0.00010389610389610389,
      "loss": 0.176,
      "step": 40
    },
    {
      "epoch": 0.16286644951140064,
      "grad_norm": 2.070906400680542,
      "learning_rate": 0.00012987012987012987,
      "loss": 0.1546,
      "step": 50
    },
    {
      "epoch": 0.19543973941368079,
      "grad_norm": 1.9704210758209229,
      "learning_rate": 0.00015584415584415587,
      "loss": 0.1463,
      "step": 60
    },
    {
      "epoch": 0.2280130293159609,
      "grad_norm": 1.2319754362106323,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.1306,
      "step": 70
    },
    {
      "epoch": 0.26058631921824105,
      "grad_norm": 1.4384595155715942,
      "learning_rate": 0.0001999979049808708,
      "loss": 0.1218,
      "step": 80
    },
    {
      "epoch": 0.2931596091205212,
      "grad_norm": 1.3861066102981567,
      "learning_rate": 0.00019996066263830531,
      "loss": 0.1031,
      "step": 90
    },
    {
      "epoch": 0.3257328990228013,
      "grad_norm": 1.3069312572479248,
      "learning_rate": 0.00019987688427197897,
      "loss": 0.1125,
      "step": 100
    },
    {
      "epoch": 0.3583061889250814,
      "grad_norm": 1.1740531921386719,
      "learning_rate": 0.0001997466088843548,
      "loss": 0.0984,
      "step": 110
    },
    {
      "epoch": 0.39087947882736157,
      "grad_norm": 0.831253170967102,
      "learning_rate": 0.00019956989712427577,
      "loss": 0.1013,
      "step": 120
    },
    {
      "epoch": 0.4234527687296417,
      "grad_norm": 1.1512802839279175,
      "learning_rate": 0.0001993468312587303,
      "loss": 0.0827,
      "step": 130
    },
    {
      "epoch": 0.4560260586319218,
      "grad_norm": 0.7890095710754395,
      "learning_rate": 0.00019907751513455302,
      "loss": 0.0747,
      "step": 140
    },
    {
      "epoch": 0.48859934853420195,
      "grad_norm": 0.8635206818580627,
      "learning_rate": 0.00019876207413008015,
      "loss": 0.0911,
      "step": 150
    },
    {
      "epoch": 0.5211726384364821,
      "grad_norm": 0.984118640422821,
      "learning_rate": 0.00019840065509677988,
      "loss": 0.0708,
      "step": 160
    },
    {
      "epoch": 0.5537459283387622,
      "grad_norm": 1.1899775266647339,
      "learning_rate": 0.00019799342629088702,
      "loss": 0.0759,
      "step": 170
    },
    {
      "epoch": 0.5863192182410424,
      "grad_norm": 0.8736202120780945,
      "learning_rate": 0.00019754057729507227,
      "loss": 0.0747,
      "step": 180
    },
    {
      "epoch": 0.6188925081433225,
      "grad_norm": 0.6165608763694763,
      "learning_rate": 0.0001970423189301833,
      "loss": 0.0697,
      "step": 190
    },
    {
      "epoch": 0.6514657980456026,
      "grad_norm": 1.2145845890045166,
      "learning_rate": 0.00019649888315709843,
      "loss": 0.0768,
      "step": 200
    },
    {
      "epoch": 0.6840390879478827,
      "grad_norm": 0.7146447896957397,
      "learning_rate": 0.00019591052296873888,
      "loss": 0.0575,
      "step": 210
    },
    {
      "epoch": 0.7166123778501629,
      "grad_norm": 1.0241068601608276,
      "learning_rate": 0.00019527751227228963,
      "loss": 0.0688,
      "step": 220
    },
    {
      "epoch": 0.749185667752443,
      "grad_norm": 0.8579204678535461,
      "learning_rate": 0.00019460014576168358,
      "loss": 0.0544,
      "step": 230
    },
    {
      "epoch": 0.7817589576547231,
      "grad_norm": 0.5284867286682129,
      "learning_rate": 0.0001938787387804088,
      "loss": 0.0516,
      "step": 240
    },
    {
      "epoch": 0.8143322475570033,
      "grad_norm": 0.6213068962097168,
      "learning_rate": 0.00019311362717470268,
      "loss": 0.0501,
      "step": 250
    },
    {
      "epoch": 0.8469055374592834,
      "grad_norm": 0.7320097088813782,
      "learning_rate": 0.00019230516713720052,
      "loss": 0.0581,
      "step": 260
    },
    {
      "epoch": 0.8794788273615635,
      "grad_norm": 0.36288219690322876,
      "learning_rate": 0.00019145373504111279,
      "loss": 0.0478,
      "step": 270
    },
    {
      "epoch": 0.9120521172638436,
      "grad_norm": 0.6096987724304199,
      "learning_rate": 0.00019055972726500695,
      "loss": 0.0548,
      "step": 280
    },
    {
      "epoch": 0.9446254071661238,
      "grad_norm": 0.7391396760940552,
      "learning_rate": 0.0001896235600082759,
      "loss": 0.0444,
      "step": 290
    },
    {
      "epoch": 0.9771986970684039,
      "grad_norm": 0.7820376753807068,
      "learning_rate": 0.00018864566909737937,
      "loss": 0.0566,
      "step": 300
    },
    {
      "epoch": 1.009771986970684,
      "grad_norm": 0.3828499913215637,
      "learning_rate": 0.00018762650978294758,
      "loss": 0.0533,
      "step": 310
    },
    {
      "epoch": 1.0423452768729642,
      "grad_norm": 0.49720415472984314,
      "learning_rate": 0.0001865665565278424,
      "loss": 0.0582,
      "step": 320
    },
    {
      "epoch": 1.0749185667752443,
      "grad_norm": 0.4968441128730774,
      "learning_rate": 0.00018546630278627437,
      "loss": 0.0473,
      "step": 330
    },
    {
      "epoch": 1.1074918566775245,
      "grad_norm": 0.812972366809845,
      "learning_rate": 0.00018432626077407829,
      "loss": 0.0382,
      "step": 340
    },
    {
      "epoch": 1.1400651465798046,
      "grad_norm": 0.4055006802082062,
      "learning_rate": 0.00018314696123025454,
      "loss": 0.043,
      "step": 350
    },
    {
      "epoch": 1.1726384364820848,
      "grad_norm": 0.6709649562835693,
      "learning_rate": 0.0001819289531698871,
      "loss": 0.0343,
      "step": 360
    },
    {
      "epoch": 1.205211726384365,
      "grad_norm": 0.6343613862991333,
      "learning_rate": 0.0001806728036285532,
      "loss": 0.0388,
      "step": 370
    },
    {
      "epoch": 1.237785016286645,
      "grad_norm": 0.44080787897109985,
      "learning_rate": 0.00017937909739834367,
      "loss": 0.0419,
      "step": 380
    },
    {
      "epoch": 1.2703583061889252,
      "grad_norm": 0.714772641658783,
      "learning_rate": 0.00017804843675561677,
      "loss": 0.0476,
      "step": 390
    },
    {
      "epoch": 1.3029315960912053,
      "grad_norm": 0.4127221703529358,
      "learning_rate": 0.00017668144118061262,
      "loss": 0.0354,
      "step": 400
    },
    {
      "epoch": 1.3355048859934853,
      "grad_norm": 0.44271162152290344,
      "learning_rate": 0.00017527874706905805,
      "loss": 0.0385,
      "step": 410
    },
    {
      "epoch": 1.3680781758957654,
      "grad_norm": 0.6225533485412598,
      "learning_rate": 0.00017384100743589697,
      "loss": 0.0665,
      "step": 420
    },
    {
      "epoch": 1.4006514657980456,
      "grad_norm": 0.4418353736400604,
      "learning_rate": 0.0001723688916112835,
      "loss": 0.0554,
      "step": 430
    },
    {
      "epoch": 1.4332247557003257,
      "grad_norm": 0.3992595672607422,
      "learning_rate": 0.00017086308492897983,
      "loss": 0.0522,
      "step": 440
    },
    {
      "epoch": 1.4657980456026058,
      "grad_norm": 0.45478880405426025,
      "learning_rate": 0.0001693242884073035,
      "loss": 0.0398,
      "step": 450
    },
    {
      "epoch": 1.498371335504886,
      "grad_norm": 0.5688516497612,
      "learning_rate": 0.00016775321842277312,
      "loss": 0.032,
      "step": 460
    },
    {
      "epoch": 1.5309446254071661,
      "grad_norm": 0.4958943724632263,
      "learning_rate": 0.00016615060637660388,
      "loss": 0.0422,
      "step": 470
    },
    {
      "epoch": 1.5635179153094463,
      "grad_norm": 0.4975818991661072,
      "learning_rate": 0.00016451719835420877,
      "loss": 0.0317,
      "step": 480
    },
    {
      "epoch": 1.5960912052117264,
      "grad_norm": 0.5288175344467163,
      "learning_rate": 0.00016285375477786322,
      "loss": 0.0412,
      "step": 490
    },
    {
      "epoch": 1.6286644951140063,
      "grad_norm": 0.6416569352149963,
      "learning_rate": 0.0001611610500526957,
      "loss": 0.0275,
      "step": 500
    },
    {
      "epoch": 1.6612377850162865,
      "grad_norm": 0.42349156737327576,
      "learning_rate": 0.00015943987220616855,
      "loss": 0.029,
      "step": 510
    },
    {
      "epoch": 1.6938110749185666,
      "grad_norm": 0.2572161853313446,
      "learning_rate": 0.00015769102252121702,
      "loss": 0.0308,
      "step": 520
    },
    {
      "epoch": 1.7263843648208468,
      "grad_norm": 0.5944684147834778,
      "learning_rate": 0.0001559153151632171,
      "loss": 0.0354,
      "step": 530
    },
    {
      "epoch": 1.758957654723127,
      "grad_norm": 0.5154451727867126,
      "learning_rate": 0.0001541135768009566,
      "loss": 0.0424,
      "step": 540
    },
    {
      "epoch": 1.791530944625407,
      "grad_norm": 0.5298171043395996,
      "learning_rate": 0.00015228664622178467,
      "loss": 0.0263,
      "step": 550
    },
    {
      "epoch": 1.8241042345276872,
      "grad_norm": 0.3582160472869873,
      "learning_rate": 0.00015043537394112007,
      "loss": 0.0386,
      "step": 560
    },
    {
      "epoch": 1.8566775244299674,
      "grad_norm": 0.6350813508033752,
      "learning_rate": 0.0001485606218064993,
      "loss": 0.0444,
      "step": 570
    },
    {
      "epoch": 1.8892508143322475,
      "grad_norm": 0.6265957355499268,
      "learning_rate": 0.00014666326259634918,
      "loss": 0.0417,
      "step": 580
    },
    {
      "epoch": 1.9218241042345277,
      "grad_norm": 0.41339483857154846,
      "learning_rate": 0.00014474417961367065,
      "loss": 0.0309,
      "step": 590
    },
    {
      "epoch": 1.9543973941368078,
      "grad_norm": 0.41287094354629517,
      "learning_rate": 0.000142804266274823,
      "loss": 0.0308,
      "step": 600
    },
    {
      "epoch": 1.986970684039088,
      "grad_norm": 0.5833337903022766,
      "learning_rate": 0.00014084442569359964,
      "loss": 0.0346,
      "step": 610
    },
    {
      "epoch": 2.019543973941368,
      "grad_norm": 0.3714321255683899,
      "learning_rate": 0.00013886557026078955,
      "loss": 0.0337,
      "step": 620
    },
    {
      "epoch": 2.0521172638436482,
      "grad_norm": 0.2712138295173645,
      "learning_rate": 0.0001368686212194199,
      "loss": 0.0196,
      "step": 630
    },
    {
      "epoch": 2.0846905537459284,
      "grad_norm": 0.6208952069282532,
      "learning_rate": 0.00013485450823587725,
      "loss": 0.0288,
      "step": 640
    },
    {
      "epoch": 2.1172638436482085,
      "grad_norm": 0.34229573607444763,
      "learning_rate": 0.00013282416896710778,
      "loss": 0.0246,
      "step": 650
    },
    {
      "epoch": 2.1498371335504887,
      "grad_norm": 0.4573960304260254,
      "learning_rate": 0.00013077854862409696,
      "loss": 0.0249,
      "step": 660
    },
    {
      "epoch": 2.182410423452769,
      "grad_norm": 0.3675532042980194,
      "learning_rate": 0.0001287185995318333,
      "loss": 0.022,
      "step": 670
    },
    {
      "epoch": 2.214983713355049,
      "grad_norm": 0.5372172594070435,
      "learning_rate": 0.0001266452806859594,
      "loss": 0.0343,
      "step": 680
    },
    {
      "epoch": 2.247557003257329,
      "grad_norm": 0.46904900670051575,
      "learning_rate": 0.00012455955730631804,
      "loss": 0.0233,
      "step": 690
    },
    {
      "epoch": 2.2801302931596092,
      "grad_norm": 0.4394093155860901,
      "learning_rate": 0.00012246240038760043,
      "loss": 0.0209,
      "step": 700
    },
    {
      "epoch": 2.3127035830618894,
      "grad_norm": 0.3127492368221283,
      "learning_rate": 0.00012035478624730608,
      "loss": 0.0321,
      "step": 710
    },
    {
      "epoch": 2.3452768729641695,
      "grad_norm": 0.601370096206665,
      "learning_rate": 0.00011823769607122479,
      "loss": 0.0243,
      "step": 720
    },
    {
      "epoch": 2.3778501628664497,
      "grad_norm": 0.5871070623397827,
      "learning_rate": 0.00011611211545665184,
      "loss": 0.0337,
      "step": 730
    },
    {
      "epoch": 2.41042345276873,
      "grad_norm": 0.3546801805496216,
      "learning_rate": 0.00011397903395354996,
      "loss": 0.0288,
      "step": 740
    },
    {
      "epoch": 2.44299674267101,
      "grad_norm": 0.8319407105445862,
      "learning_rate": 0.0001118394446038708,
      "loss": 0.0337,
      "step": 750
    },
    {
      "epoch": 2.47557003257329,
      "grad_norm": 0.5210663080215454,
      "learning_rate": 0.00010969434347925076,
      "loss": 0.026,
      "step": 760
    },
    {
      "epoch": 2.5081433224755703,
      "grad_norm": 0.5834184288978577,
      "learning_rate": 0.00010754472921729661,
      "loss": 0.0282,
      "step": 770
    },
    {
      "epoch": 2.5407166123778504,
      "grad_norm": 0.42890864610671997,
      "learning_rate": 0.00010539160255667623,
      "loss": 0.028,
      "step": 780
    },
    {
      "epoch": 2.5732899022801305,
      "grad_norm": 0.4473400413990021,
      "learning_rate": 0.00010323596587123145,
      "loss": 0.025,
      "step": 790
    },
    {
      "epoch": 2.6058631921824107,
      "grad_norm": 0.5189303159713745,
      "learning_rate": 0.00010107882270332952,
      "loss": 0.0293,
      "step": 800
    },
    {
      "epoch": 2.6384364820846904,
      "grad_norm": 0.43365001678466797,
      "learning_rate": 9.892117729667052e-05,
      "loss": 0.0175,
      "step": 810
    },
    {
      "epoch": 2.6710097719869705,
      "grad_norm": 0.28346696496009827,
      "learning_rate": 9.676403412876856e-05,
      "loss": 0.0334,
      "step": 820
    },
    {
      "epoch": 2.7035830618892507,
      "grad_norm": 0.3956477642059326,
      "learning_rate": 9.460839744332378e-05,
      "loss": 0.0271,
      "step": 830
    },
    {
      "epoch": 2.736156351791531,
      "grad_norm": 0.30705949664115906,
      "learning_rate": 9.245527078270341e-05,
      "loss": 0.0217,
      "step": 840
    },
    {
      "epoch": 2.768729641693811,
      "grad_norm": 0.40188854932785034,
      "learning_rate": 9.030565652074926e-05,
      "loss": 0.019,
      "step": 850
    },
    {
      "epoch": 2.801302931596091,
      "grad_norm": 0.3447129428386688,
      "learning_rate": 8.816055539612924e-05,
      "loss": 0.028,
      "step": 860
    },
    {
      "epoch": 2.8338762214983713,
      "grad_norm": 0.38768622279167175,
      "learning_rate": 8.602096604645009e-05,
      "loss": 0.0218,
      "step": 870
    },
    {
      "epoch": 2.8664495114006514,
      "grad_norm": 0.26912721991539,
      "learning_rate": 8.388788454334817e-05,
      "loss": 0.0173,
      "step": 880
    },
    {
      "epoch": 2.8990228013029316,
      "grad_norm": 0.33078861236572266,
      "learning_rate": 8.176230392877523e-05,
      "loss": 0.0233,
      "step": 890
    },
    {
      "epoch": 2.9315960912052117,
      "grad_norm": 0.24832488596439362,
      "learning_rate": 7.964521375269396e-05,
      "loss": 0.0171,
      "step": 900
    },
    {
      "epoch": 2.964169381107492,
      "grad_norm": 0.6595136523246765,
      "learning_rate": 7.753759961239964e-05,
      "loss": 0.0272,
      "step": 910
    },
    {
      "epoch": 2.996742671009772,
      "grad_norm": 0.2780207097530365,
      "learning_rate": 7.544044269368197e-05,
      "loss": 0.0338,
      "step": 920
    },
    {
      "epoch": 3.029315960912052,
      "grad_norm": 0.7173179388046265,
      "learning_rate": 7.335471931404063e-05,
      "loss": 0.0365,
      "step": 930
    },
    {
      "epoch": 3.0618892508143323,
      "grad_norm": 0.33753442764282227,
      "learning_rate": 7.128140046816671e-05,
      "loss": 0.0195,
      "step": 940
    },
    {
      "epoch": 3.0944625407166124,
      "grad_norm": 0.35064950585365295,
      "learning_rate": 6.922145137590306e-05,
      "loss": 0.02,
      "step": 950
    },
    {
      "epoch": 3.1270358306188926,
      "grad_norm": 0.39598166942596436,
      "learning_rate": 6.717583103289229e-05,
      "loss": 0.0203,
      "step": 960
    },
    {
      "epoch": 3.1596091205211727,
      "grad_norm": 0.18257524073123932,
      "learning_rate": 6.514549176412275e-05,
      "loss": 0.0134,
      "step": 970
    },
    {
      "epoch": 3.192182410423453,
      "grad_norm": 0.4458347260951996,
      "learning_rate": 6.313137878058013e-05,
      "loss": 0.0236,
      "step": 980
    },
    {
      "epoch": 3.224755700325733,
      "grad_norm": 0.22742605209350586,
      "learning_rate": 6.113442973921046e-05,
      "loss": 0.0208,
      "step": 990
    },
    {
      "epoch": 3.257328990228013,
      "grad_norm": 0.1858537793159485,
      "learning_rate": 5.9155574306400395e-05,
      "loss": 0.0218,
      "step": 1000
    },
    {
      "epoch": 3.2899022801302933,
      "grad_norm": 0.24626286327838898,
      "learning_rate": 5.7195733725176994e-05,
      "loss": 0.0232,
      "step": 1010
    },
    {
      "epoch": 3.3224755700325734,
      "grad_norm": 0.2719153165817261,
      "learning_rate": 5.525582038632934e-05,
      "loss": 0.0148,
      "step": 1020
    },
    {
      "epoch": 3.3550488599348536,
      "grad_norm": 0.218730166554451,
      "learning_rate": 5.333673740365083e-05,
      "loss": 0.0157,
      "step": 1030
    },
    {
      "epoch": 3.3876221498371337,
      "grad_norm": 0.20292945206165314,
      "learning_rate": 5.1439378193500707e-05,
      "loss": 0.0143,
      "step": 1040
    },
    {
      "epoch": 3.420195439739414,
      "grad_norm": 0.2846449017524719,
      "learning_rate": 4.956462605887994e-05,
      "loss": 0.0177,
      "step": 1050
    },
    {
      "epoch": 3.4527687296416936,
      "grad_norm": 0.322721391916275,
      "learning_rate": 4.771335377821535e-05,
      "loss": 0.0224,
      "step": 1060
    },
    {
      "epoch": 3.4853420195439737,
      "grad_norm": 0.1719449758529663,
      "learning_rate": 4.588642319904343e-05,
      "loss": 0.0234,
      "step": 1070
    },
    {
      "epoch": 3.517915309446254,
      "grad_norm": 0.44704851508140564,
      "learning_rate": 4.408468483678293e-05,
      "loss": 0.019,
      "step": 1080
    },
    {
      "epoch": 3.550488599348534,
      "grad_norm": 0.4159814417362213,
      "learning_rate": 4.230897747878303e-05,
      "loss": 0.0156,
      "step": 1090
    },
    {
      "epoch": 3.583061889250814,
      "grad_norm": 0.19604472815990448,
      "learning_rate": 4.056012779383145e-05,
      "loss": 0.0158,
      "step": 1100
    },
    {
      "epoch": 3.6156351791530943,
      "grad_norm": 0.19116809964179993,
      "learning_rate": 3.883894994730428e-05,
      "loss": 0.0174,
      "step": 1110
    },
    {
      "epoch": 3.6482084690553744,
      "grad_norm": 0.3637801706790924,
      "learning_rate": 3.714624522213681e-05,
      "loss": 0.0162,
      "step": 1120
    },
    {
      "epoch": 3.6807817589576546,
      "grad_norm": 0.1877295821905136,
      "learning_rate": 3.548280164579126e-05,
      "loss": 0.0142,
      "step": 1130
    },
    {
      "epoch": 3.7133550488599347,
      "grad_norm": 0.1830226182937622,
      "learning_rate": 3.384939362339614e-05,
      "loss": 0.0119,
      "step": 1140
    },
    {
      "epoch": 3.745928338762215,
      "grad_norm": 0.15163740515708923,
      "learning_rate": 3.224678157722689e-05,
      "loss": 0.0181,
      "step": 1150
    },
    {
      "epoch": 3.778501628664495,
      "grad_norm": 0.2479788213968277,
      "learning_rate": 3.067571159269651e-05,
      "loss": 0.0138,
      "step": 1160
    },
    {
      "epoch": 3.811074918566775,
      "grad_norm": 0.6171669960021973,
      "learning_rate": 2.913691507102019e-05,
      "loss": 0.0197,
      "step": 1170
    },
    {
      "epoch": 3.8436482084690553,
      "grad_norm": 0.18519634008407593,
      "learning_rate": 2.763110838871651e-05,
      "loss": 0.0137,
      "step": 1180
    },
    {
      "epoch": 3.8762214983713354,
      "grad_norm": 0.26303982734680176,
      "learning_rate": 2.6158992564103058e-05,
      "loss": 0.0172,
      "step": 1190
    },
    {
      "epoch": 3.9087947882736156,
      "grad_norm": 0.28331807255744934,
      "learning_rate": 2.4721252930941974e-05,
      "loss": 0.0168,
      "step": 1200
    },
    {
      "epoch": 3.9413680781758957,
      "grad_norm": 0.20530906319618225,
      "learning_rate": 2.3318558819387404e-05,
      "loss": 0.0199,
      "step": 1210
    },
    {
      "epoch": 3.973941368078176,
      "grad_norm": 0.16924133896827698,
      "learning_rate": 2.1951563244383233e-05,
      "loss": 0.0146,
      "step": 1220
    },
    {
      "epoch": 4.006514657980456,
      "grad_norm": 0.13186028599739075,
      "learning_rate": 2.0620902601656345e-05,
      "loss": 0.0124,
      "step": 1230
    },
    {
      "epoch": 4.039087947882736,
      "grad_norm": 0.24360792338848114,
      "learning_rate": 1.9327196371446776e-05,
      "loss": 0.0119,
      "step": 1240
    },
    {
      "epoch": 4.071661237785016,
      "grad_norm": 0.09876150637865067,
      "learning_rate": 1.807104683011289e-05,
      "loss": 0.012,
      "step": 1250
    },
    {
      "epoch": 4.1042345276872965,
      "grad_norm": 0.2283184826374054,
      "learning_rate": 1.6853038769745467e-05,
      "loss": 0.0142,
      "step": 1260
    },
    {
      "epoch": 4.136807817589577,
      "grad_norm": 0.32383596897125244,
      "learning_rate": 1.5673739225921758e-05,
      "loss": 0.012,
      "step": 1270
    },
    {
      "epoch": 4.169381107491857,
      "grad_norm": 0.2783248722553253,
      "learning_rate": 1.4533697213725662e-05,
      "loss": 0.0163,
      "step": 1280
    },
    {
      "epoch": 4.201954397394137,
      "grad_norm": 0.17678265273571014,
      "learning_rate": 1.3433443472157613e-05,
      "loss": 0.012,
      "step": 1290
    },
    {
      "epoch": 4.234527687296417,
      "grad_norm": 0.25102487206459045,
      "learning_rate": 1.237349021705243e-05,
      "loss": 0.0158,
      "step": 1300
    },
    {
      "epoch": 4.267100977198697,
      "grad_norm": 0.15461167693138123,
      "learning_rate": 1.1354330902620636e-05,
      "loss": 0.0126,
      "step": 1310
    },
    {
      "epoch": 4.299674267100977,
      "grad_norm": 0.24122057855129242,
      "learning_rate": 1.0376439991724096e-05,
      "loss": 0.0168,
      "step": 1320
    },
    {
      "epoch": 4.3322475570032575,
      "grad_norm": 0.14669205248355865,
      "learning_rate": 9.440272734993072e-06,
      "loss": 0.0179,
      "step": 1330
    },
    {
      "epoch": 4.364820846905538,
      "grad_norm": 0.32440969347953796,
      "learning_rate": 8.546264958887219e-06,
      "loss": 0.0197,
      "step": 1340
    },
    {
      "epoch": 4.397394136807818,
      "grad_norm": 0.14456795156002045,
      "learning_rate": 7.694832862799505e-06,
      "loss": 0.0111,
      "step": 1350
    },
    {
      "epoch": 4.429967426710098,
      "grad_norm": 0.17956456542015076,
      "learning_rate": 6.886372825297349e-06,
      "loss": 0.0085,
      "step": 1360
    },
    {
      "epoch": 4.462540716612378,
      "grad_norm": 0.30424752831459045,
      "learning_rate": 6.12126121959119e-06,
      "loss": 0.0207,
      "step": 1370
    },
    {
      "epoch": 4.495114006514658,
      "grad_norm": 0.18671758472919464,
      "learning_rate": 5.399854238316437e-06,
      "loss": 0.013,
      "step": 1380
    },
    {
      "epoch": 4.527687296416938,
      "grad_norm": 0.3565406799316406,
      "learning_rate": 4.722487727710368e-06,
      "loss": 0.0165,
      "step": 1390
    },
    {
      "epoch": 4.5602605863192185,
      "grad_norm": 0.26344749331474304,
      "learning_rate": 4.089477031261113e-06,
      "loss": 0.0148,
      "step": 1400
    },
    {
      "epoch": 4.592833876221499,
      "grad_norm": 0.18339155614376068,
      "learning_rate": 3.5011168429016083e-06,
      "loss": 0.0195,
      "step": 1410
    },
    {
      "epoch": 4.625407166123779,
      "grad_norm": 0.2598022222518921,
      "learning_rate": 2.95768106981672e-06,
      "loss": 0.0135,
      "step": 1420
    },
    {
      "epoch": 4.657980456026059,
      "grad_norm": 0.3853515684604645,
      "learning_rate": 2.4594227049277386e-06,
      "loss": 0.0177,
      "step": 1430
    },
    {
      "epoch": 4.690553745928339,
      "grad_norm": 0.13664180040359497,
      "learning_rate": 2.006573709112991e-06,
      "loss": 0.0086,
      "step": 1440
    },
    {
      "epoch": 4.723127035830619,
      "grad_norm": 0.1015399917960167,
      "learning_rate": 1.5993449032201458e-06,
      "loss": 0.0116,
      "step": 1450
    },
    {
      "epoch": 4.755700325732899,
      "grad_norm": 0.18885648250579834,
      "learning_rate": 1.237925869919887e-06,
      "loss": 0.0175,
      "step": 1460
    },
    {
      "epoch": 4.7882736156351795,
      "grad_norm": 0.18131224811077118,
      "learning_rate": 9.224848654469931e-07,
      "loss": 0.0088,
      "step": 1470
    },
    {
      "epoch": 4.82084690553746,
      "grad_norm": 0.194551482796669,
      "learning_rate": 6.531687412697496e-07,
      "loss": 0.014,
      "step": 1480
    },
    {
      "epoch": 4.85342019543974,
      "grad_norm": 0.23798178136348724,
      "learning_rate": 4.3010287572422537e-07,
      "loss": 0.0097,
      "step": 1490
    },
    {
      "epoch": 4.88599348534202,
      "grad_norm": 0.141094371676445,
      "learning_rate": 2.5339111564521844e-07,
      "loss": 0.0151,
      "step": 1500
    },
    {
      "epoch": 4.918566775244299,
      "grad_norm": 0.17839759588241577,
      "learning_rate": 1.2311572802105043e-07,
      "loss": 0.0097,
      "step": 1510
    },
    {
      "epoch": 4.95114006514658,
      "grad_norm": 0.3124100863933563,
      "learning_rate": 3.933736169471347e-08,
      "loss": 0.0098,
      "step": 1520
    },
    {
      "epoch": 4.9837133550488595,
      "grad_norm": 0.17664480209350586,
      "learning_rate": 2.0950191292112842e-09,
      "loss": 0.0142,
      "step": 1530
    },
    {
      "epoch": 4.993485342019544,
      "step": 1533,
      "total_flos": 5.185032946443418e+16,
      "train_loss": 0.05182663513868756,
      "train_runtime": 722.6992,
      "train_samples_per_second": 33.939,
      "train_steps_per_second": 2.121
    }
  ],
  "logging_steps": 10,
  "max_steps": 1533,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.185032946443418e+16,
  "train_batch_size": 16,
  "trial_name": null,
  "trial_params": null
}