{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.030517578125,
  "eval_steps": 500,
  "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003814697265625,
      "grad_norm": 1.8202160596847534,
      "learning_rate": 4e-05,
      "loss": 1.7732,
      "step": 100
    },
    {
      "epoch": 0.000762939453125,
      "grad_norm": 1.0439223051071167,
      "learning_rate": 8e-05,
      "loss": 0.3055,
      "step": 200
    },
    {
      "epoch": 0.0011444091796875,
      "grad_norm": 0.49549755454063416,
      "learning_rate": 0.00012,
      "loss": 0.2596,
      "step": 300
    },
    {
      "epoch": 0.00152587890625,
      "grad_norm": 0.9390599727630615,
      "learning_rate": 0.00016,
      "loss": 0.2547,
      "step": 400
    },
    {
      "epoch": 0.0019073486328125,
      "grad_norm": 0.4973730444908142,
      "learning_rate": 0.0002,
      "loss": 0.2534,
      "step": 500
    },
    {
      "epoch": 0.002288818359375,
      "grad_norm": 0.45400354266166687,
      "learning_rate": 0.000199748427672956,
      "loss": 0.2446,
      "step": 600
    },
    {
      "epoch": 0.0026702880859375,
      "grad_norm": 0.39165329933166504,
      "learning_rate": 0.00019949685534591195,
      "loss": 0.2411,
      "step": 700
    },
    {
      "epoch": 0.0030517578125,
      "grad_norm": 0.4248354434967041,
      "learning_rate": 0.00019924528301886794,
      "loss": 0.2392,
      "step": 800
    },
    {
      "epoch": 0.0034332275390625,
      "grad_norm": 0.35752373933792114,
      "learning_rate": 0.0001989937106918239,
      "loss": 0.236,
      "step": 900
    },
    {
      "epoch": 0.003814697265625,
      "grad_norm": 0.39206448197364807,
      "learning_rate": 0.00019874213836477988,
      "loss": 0.2322,
      "step": 1000
    },
    {
      "epoch": 0.0041961669921875,
      "grad_norm": 0.3509558439254761,
      "learning_rate": 0.00019849056603773587,
      "loss": 0.2312,
      "step": 1100
    },
    {
      "epoch": 0.00457763671875,
      "grad_norm": 0.3513820171356201,
      "learning_rate": 0.00019823899371069183,
      "loss": 0.2308,
      "step": 1200
    },
    {
      "epoch": 0.0049591064453125,
      "grad_norm": 0.434176504611969,
      "learning_rate": 0.0001979874213836478,
      "loss": 0.2284,
      "step": 1300
    },
    {
      "epoch": 0.005340576171875,
      "grad_norm": 0.37612399458885193,
      "learning_rate": 0.0001977358490566038,
      "loss": 0.2289,
      "step": 1400
    },
    {
      "epoch": 0.0057220458984375,
      "grad_norm": 0.3991953134536743,
      "learning_rate": 0.00019748427672955975,
      "loss": 0.23,
      "step": 1500
    },
    {
      "epoch": 0.006103515625,
      "grad_norm": 0.4121605157852173,
      "learning_rate": 0.00019723270440251574,
      "loss": 0.2284,
      "step": 1600
    },
    {
      "epoch": 0.0064849853515625,
      "grad_norm": 0.3937987983226776,
      "learning_rate": 0.0001969811320754717,
      "loss": 0.2249,
      "step": 1700
    },
    {
      "epoch": 0.006866455078125,
      "grad_norm": 0.2995181083679199,
      "learning_rate": 0.00019672955974842768,
      "loss": 0.2257,
      "step": 1800
    },
    {
      "epoch": 0.0072479248046875,
      "grad_norm": 0.5119357705116272,
      "learning_rate": 0.00019647798742138367,
      "loss": 0.2292,
      "step": 1900
    },
    {
      "epoch": 0.00762939453125,
      "grad_norm": 0.31295427680015564,
      "learning_rate": 0.00019622641509433963,
      "loss": 0.2289,
      "step": 2000
    },
    {
      "epoch": 0.0080108642578125,
      "grad_norm": 0.2797456979751587,
      "learning_rate": 0.0001959748427672956,
      "loss": 0.2232,
      "step": 2100
    },
    {
      "epoch": 0.008392333984375,
      "grad_norm": 0.45458996295928955,
      "learning_rate": 0.00019572327044025157,
      "loss": 0.2244,
      "step": 2200
    },
    {
      "epoch": 0.0087738037109375,
      "grad_norm": 0.29631954431533813,
      "learning_rate": 0.00019547169811320755,
      "loss": 0.2234,
      "step": 2300
    },
    {
      "epoch": 0.0091552734375,
      "grad_norm": 0.5060445070266724,
      "learning_rate": 0.00019522012578616354,
      "loss": 0.2265,
      "step": 2400
    },
    {
      "epoch": 0.0095367431640625,
      "grad_norm": 0.28566980361938477,
      "learning_rate": 0.0001949685534591195,
      "loss": 0.2279,
      "step": 2500
    },
    {
      "epoch": 0.009918212890625,
      "grad_norm": 0.24325500428676605,
      "learning_rate": 0.00019471698113207548,
      "loss": 0.2306,
      "step": 2600
    },
    {
      "epoch": 0.0102996826171875,
      "grad_norm": 0.3140350878238678,
      "learning_rate": 0.00019446540880503147,
      "loss": 0.2234,
      "step": 2700
    },
    {
      "epoch": 0.01068115234375,
      "grad_norm": 0.4366394877433777,
      "learning_rate": 0.00019421383647798743,
      "loss": 0.2224,
      "step": 2800
    },
    {
      "epoch": 0.0110626220703125,
      "grad_norm": 0.27782708406448364,
      "learning_rate": 0.0001939622641509434,
      "loss": 0.2236,
      "step": 2900
    },
    {
      "epoch": 0.011444091796875,
      "grad_norm": 0.3332788944244385,
      "learning_rate": 0.00019371069182389937,
      "loss": 0.2241,
      "step": 3000
    },
    {
      "epoch": 0.0118255615234375,
      "grad_norm": 0.3888827860355377,
      "learning_rate": 0.00019345911949685536,
      "loss": 0.2217,
      "step": 3100
    },
    {
      "epoch": 0.01220703125,
      "grad_norm": 0.24029745161533356,
      "learning_rate": 0.00019320754716981134,
      "loss": 0.2216,
      "step": 3200
    },
    {
      "epoch": 0.0125885009765625,
      "grad_norm": 1.8477509021759033,
      "learning_rate": 0.0001929559748427673,
      "loss": 0.2252,
      "step": 3300
    },
    {
      "epoch": 0.012969970703125,
      "grad_norm": 0.5924927592277527,
      "learning_rate": 0.00019270440251572328,
      "loss": 0.2352,
      "step": 3400
    },
    {
      "epoch": 0.0133514404296875,
      "grad_norm": 0.33940935134887695,
      "learning_rate": 0.00019245283018867927,
      "loss": 0.2253,
      "step": 3500
    },
    {
      "epoch": 0.01373291015625,
      "grad_norm": 0.3898316025733948,
      "learning_rate": 0.00019220125786163523,
      "loss": 0.2216,
      "step": 3600
    },
    {
      "epoch": 0.0141143798828125,
      "grad_norm": 0.2601265609264374,
      "learning_rate": 0.0001919496855345912,
      "loss": 0.2261,
      "step": 3700
    },
    {
      "epoch": 0.014495849609375,
      "grad_norm": 0.32615959644317627,
      "learning_rate": 0.00019169811320754717,
      "loss": 0.2225,
      "step": 3800
    },
    {
      "epoch": 0.0148773193359375,
      "grad_norm": 0.2891947627067566,
      "learning_rate": 0.00019144654088050316,
      "loss": 0.2216,
      "step": 3900
    },
    {
      "epoch": 0.0152587890625,
      "grad_norm": 0.2846430242061615,
      "learning_rate": 0.00019119496855345914,
      "loss": 0.2197,
      "step": 4000
    },
    {
      "epoch": 0.0156402587890625,
      "grad_norm": 0.2938269078731537,
      "learning_rate": 0.0001909433962264151,
      "loss": 0.2212,
      "step": 4100
    },
    {
      "epoch": 0.016021728515625,
      "grad_norm": 0.2718958258628845,
      "learning_rate": 0.00019069182389937108,
      "loss": 0.2205,
      "step": 4200
    },
    {
      "epoch": 0.0164031982421875,
      "grad_norm": 0.3561397194862366,
      "learning_rate": 0.00019044025157232704,
      "loss": 0.2205,
      "step": 4300
    },
    {
      "epoch": 0.01678466796875,
      "grad_norm": 0.4546607732772827,
      "learning_rate": 0.00019018867924528303,
      "loss": 0.2234,
      "step": 4400
    },
    {
      "epoch": 0.0171661376953125,
      "grad_norm": 0.29250577092170715,
      "learning_rate": 0.00018993710691823901,
      "loss": 0.2197,
      "step": 4500
    },
    {
      "epoch": 0.017547607421875,
      "grad_norm": 1.6952908039093018,
      "learning_rate": 0.00018968553459119497,
      "loss": 0.2217,
      "step": 4600
    },
    {
      "epoch": 0.0179290771484375,
      "grad_norm": 0.3261864483356476,
      "learning_rate": 0.00018943396226415096,
      "loss": 0.2269,
      "step": 4700
    },
    {
      "epoch": 0.018310546875,
      "grad_norm": 0.2668060064315796,
      "learning_rate": 0.00018918238993710694,
      "loss": 0.2203,
      "step": 4800
    },
    {
      "epoch": 0.0186920166015625,
      "grad_norm": 0.31689000129699707,
      "learning_rate": 0.0001889308176100629,
      "loss": 0.2201,
      "step": 4900
    },
    {
      "epoch": 0.019073486328125,
      "grad_norm": 0.26320216059684753,
      "learning_rate": 0.00018867924528301889,
      "loss": 0.2214,
      "step": 5000
    },
    {
      "epoch": 0.0194549560546875,
      "grad_norm": 0.26768413186073303,
      "learning_rate": 0.00018842767295597484,
      "loss": 0.2225,
      "step": 5100
    },
    {
      "epoch": 0.01983642578125,
      "grad_norm": 0.2808452248573303,
      "learning_rate": 0.00018817610062893083,
      "loss": 0.2208,
      "step": 5200
    },
    {
      "epoch": 0.0202178955078125,
      "grad_norm": 0.25958341360092163,
      "learning_rate": 0.00018792452830188681,
      "loss": 0.2207,
      "step": 5300
    },
    {
      "epoch": 0.020599365234375,
      "grad_norm": 0.22953402996063232,
      "learning_rate": 0.00018767295597484277,
      "loss": 0.2193,
      "step": 5400
    },
    {
      "epoch": 0.0209808349609375,
      "grad_norm": 0.9375737905502319,
      "learning_rate": 0.00018742138364779876,
      "loss": 0.2206,
      "step": 5500
    },
    {
      "epoch": 0.0213623046875,
      "grad_norm": 0.2852359712123871,
      "learning_rate": 0.00018716981132075472,
      "loss": 0.2211,
      "step": 5600
    },
    {
      "epoch": 0.0217437744140625,
      "grad_norm": 0.25367122888565063,
      "learning_rate": 0.0001869182389937107,
      "loss": 0.2191,
      "step": 5700
    },
    {
      "epoch": 0.022125244140625,
      "grad_norm": 0.2215207815170288,
      "learning_rate": 0.0001866666666666667,
      "loss": 0.2218,
      "step": 5800
    },
    {
      "epoch": 0.0225067138671875,
      "grad_norm": 0.24178574979305267,
      "learning_rate": 0.00018641509433962264,
      "loss": 0.2283,
      "step": 5900
    },
    {
      "epoch": 0.02288818359375,
      "grad_norm": 0.3638046979904175,
      "learning_rate": 0.00018616352201257863,
      "loss": 0.2217,
      "step": 6000
    },
    {
      "epoch": 0.0232696533203125,
      "grad_norm": 0.40834301710128784,
      "learning_rate": 0.00018591194968553462,
      "loss": 0.2204,
      "step": 6100
    },
    {
      "epoch": 0.023651123046875,
      "grad_norm": 0.24277737736701965,
      "learning_rate": 0.00018566037735849057,
      "loss": 0.2178,
      "step": 6200
    },
    {
      "epoch": 0.0240325927734375,
      "grad_norm": 0.3276098370552063,
      "learning_rate": 0.00018540880503144656,
      "loss": 0.2245,
      "step": 6300
    },
    {
      "epoch": 0.0244140625,
      "grad_norm": 0.40407466888427734,
      "learning_rate": 0.00018515723270440252,
      "loss": 0.2258,
      "step": 6400
    },
    {
      "epoch": 0.0247955322265625,
      "grad_norm": 0.31675395369529724,
      "learning_rate": 0.0001849056603773585,
      "loss": 0.223,
      "step": 6500
    },
    {
      "epoch": 0.025177001953125,
      "grad_norm": 0.2858389616012573,
      "learning_rate": 0.0001846540880503145,
      "loss": 0.2201,
      "step": 6600
    },
    {
      "epoch": 0.0255584716796875,
      "grad_norm": 0.2711004912853241,
      "learning_rate": 0.00018440251572327045,
      "loss": 0.2175,
      "step": 6700
    },
    {
      "epoch": 0.02593994140625,
      "grad_norm": 0.24398334324359894,
      "learning_rate": 0.00018415094339622643,
      "loss": 0.2195,
      "step": 6800
    },
    {
      "epoch": 0.0263214111328125,
      "grad_norm": 0.29580453038215637,
      "learning_rate": 0.0001838993710691824,
      "loss": 0.2198,
      "step": 6900
    },
    {
      "epoch": 0.026702880859375,
      "grad_norm": 0.2624952495098114,
      "learning_rate": 0.00018364779874213837,
      "loss": 0.217,
      "step": 7000
    },
    {
      "epoch": 0.0270843505859375,
      "grad_norm": 0.2129925936460495,
      "learning_rate": 0.00018339622641509436,
      "loss": 0.2188,
      "step": 7100
    },
    {
      "epoch": 0.0274658203125,
      "grad_norm": 0.27471479773521423,
      "learning_rate": 0.00018314465408805032,
      "loss": 0.2174,
      "step": 7200
    },
    {
      "epoch": 0.0278472900390625,
      "grad_norm": 1.0204274654388428,
      "learning_rate": 0.0001828930817610063,
      "loss": 0.2186,
      "step": 7300
    },
    {
      "epoch": 0.028228759765625,
      "grad_norm": 0.5174055695533752,
      "learning_rate": 0.0001826415094339623,
      "loss": 0.2198,
      "step": 7400
    },
    {
      "epoch": 0.0286102294921875,
      "grad_norm": 1.7667677402496338,
      "learning_rate": 0.00018238993710691825,
      "loss": 0.2221,
      "step": 7500
    },
    {
      "epoch": 0.02899169921875,
      "grad_norm": 0.34651100635528564,
      "learning_rate": 0.00018213836477987423,
      "loss": 0.2215,
      "step": 7600
    },
    {
      "epoch": 0.0293731689453125,
      "grad_norm": 0.2900320589542389,
      "learning_rate": 0.0001818867924528302,
      "loss": 0.2184,
      "step": 7700
    },
    {
      "epoch": 0.029754638671875,
      "grad_norm": 0.21523432433605194,
      "learning_rate": 0.00018163522012578617,
      "loss": 0.2171,
      "step": 7800
    },
    {
      "epoch": 0.0301361083984375,
      "grad_norm": 0.28846126794815063,
      "learning_rate": 0.00018138364779874216,
      "loss": 0.2175,
      "step": 7900
    },
    {
      "epoch": 0.030517578125,
      "grad_norm": 0.27318933606147766,
      "learning_rate": 0.00018113207547169812,
      "loss": 0.218,
      "step": 8000
    }
  ],
  "logging_steps": 100,
  "max_steps": 80000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 9.909320422780109e+16,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}