{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.908045977011493,
  "eval_steps": 500,
  "global_step": 1732,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.11494252873563218,
      "grad_norm": 5.833221435546875,
      "learning_rate": 2.2988505747126437e-05,
      "loss": 1.1691,
      "step": 10
    },
    {
      "epoch": 0.22988505747126436,
      "grad_norm": 3.3890936374664307,
      "learning_rate": 4.597701149425287e-05,
      "loss": 0.4962,
      "step": 20
    },
    {
      "epoch": 0.3448275862068966,
      "grad_norm": 1.4635858535766602,
      "learning_rate": 6.896551724137931e-05,
      "loss": 0.2852,
      "step": 30
    },
    {
      "epoch": 0.45977011494252873,
      "grad_norm": 1.3263287544250488,
      "learning_rate": 9.195402298850575e-05,
      "loss": 0.2068,
      "step": 40
    },
    {
      "epoch": 0.5747126436781609,
      "grad_norm": 1.4933586120605469,
      "learning_rate": 0.00011494252873563218,
      "loss": 0.1671,
      "step": 50
    },
    {
      "epoch": 0.6896551724137931,
      "grad_norm": 3.461280584335327,
      "learning_rate": 0.00013793103448275863,
      "loss": 0.163,
      "step": 60
    },
    {
      "epoch": 0.8045977011494253,
      "grad_norm": 0.8741048574447632,
      "learning_rate": 0.00016091954022988506,
      "loss": 0.1625,
      "step": 70
    },
    {
      "epoch": 0.9195402298850575,
      "grad_norm": 0.9238091111183167,
      "learning_rate": 0.0001839080459770115,
      "loss": 0.1292,
      "step": 80
    },
    {
      "epoch": 1.0344827586206897,
      "grad_norm": 1.1964523792266846,
      "learning_rate": 0.00019999835873288997,
      "loss": 0.1137,
      "step": 90
    },
    {
      "epoch": 1.1494252873563218,
      "grad_norm": 0.6014072895050049,
      "learning_rate": 0.0001999691821496584,
      "loss": 0.1066,
      "step": 100
    },
    {
      "epoch": 1.264367816091954,
      "grad_norm": 0.6493586897850037,
      "learning_rate": 0.00019990354521250803,
      "loss": 0.1051,
      "step": 110
    },
    {
      "epoch": 1.3793103448275863,
      "grad_norm": 0.6839510202407837,
      "learning_rate": 0.00019980147186027586,
      "loss": 0.0926,
      "step": 120
    },
    {
      "epoch": 1.4942528735632183,
      "grad_norm": 0.5401099920272827,
      "learning_rate": 0.00019966299932074023,
      "loss": 0.0895,
      "step": 130
    },
    {
      "epoch": 1.6091954022988506,
      "grad_norm": 1.1563774347305298,
      "learning_rate": 0.000199488178097043,
      "loss": 0.0915,
      "step": 140
    },
    {
      "epoch": 1.7241379310344827,
      "grad_norm": 2.1474926471710205,
      "learning_rate": 0.00019927707194927066,
      "loss": 0.0853,
      "step": 150
    },
    {
      "epoch": 1.839080459770115,
      "grad_norm": 0.7267495393753052,
      "learning_rate": 0.00019902975787119956,
      "loss": 0.0873,
      "step": 160
    },
    {
      "epoch": 1.9540229885057472,
      "grad_norm": 0.5461205244064331,
      "learning_rate": 0.00019874632606221545,
      "loss": 0.0739,
      "step": 170
    },
    {
      "epoch": 2.0689655172413794,
      "grad_norm": 0.43381860852241516,
      "learning_rate": 0.00019842687989441604,
      "loss": 0.0682,
      "step": 180
    },
    {
      "epoch": 2.1839080459770113,
      "grad_norm": 0.619968593120575,
      "learning_rate": 0.00019807153587490963,
      "loss": 0.0725,
      "step": 190
    },
    {
      "epoch": 2.2988505747126435,
      "grad_norm": 0.5531813502311707,
      "learning_rate": 0.00019768042360332325,
      "loss": 0.0649,
      "step": 200
    },
    {
      "epoch": 2.413793103448276,
      "grad_norm": 0.4325454831123352,
      "learning_rate": 0.00019725368572453539,
      "loss": 0.0629,
      "step": 210
    },
    {
      "epoch": 2.528735632183908,
      "grad_norm": 1.132866382598877,
      "learning_rate": 0.00019679147787665126,
      "loss": 0.0597,
      "step": 220
    },
    {
      "epoch": 2.6436781609195403,
      "grad_norm": 0.5158783793449402,
      "learning_rate": 0.00019629396863423911,
      "loss": 0.0658,
      "step": 230
    },
    {
      "epoch": 2.7586206896551726,
      "grad_norm": 0.5275442600250244,
      "learning_rate": 0.0001957613394468484,
      "loss": 0.0624,
      "step": 240
    },
    {
      "epoch": 2.873563218390805,
      "grad_norm": 0.26212960481643677,
      "learning_rate": 0.0001951937845728321,
      "loss": 0.0565,
      "step": 250
    },
    {
      "epoch": 2.9885057471264367,
      "grad_norm": 0.4064450263977051,
      "learning_rate": 0.00019459151100849784,
      "loss": 0.0586,
      "step": 260
    },
    {
      "epoch": 3.103448275862069,
      "grad_norm": 0.497156023979187,
      "learning_rate": 0.0001939547384126128,
      "loss": 0.0569,
      "step": 270
    },
    {
      "epoch": 3.218390804597701,
      "grad_norm": 0.23238833248615265,
      "learning_rate": 0.00019328369902629083,
      "loss": 0.048,
      "step": 280
    },
    {
      "epoch": 3.3333333333333335,
      "grad_norm": 0.33057811856269836,
      "learning_rate": 0.00019257863758829035,
      "loss": 0.0508,
      "step": 290
    },
    {
      "epoch": 3.4482758620689653,
      "grad_norm": 0.2923976182937622,
      "learning_rate": 0.00019183981124575418,
      "loss": 0.059,
      "step": 300
    },
    {
      "epoch": 3.5632183908045976,
      "grad_norm": 0.40444961190223694,
      "learning_rate": 0.00019106748946042407,
      "loss": 0.0589,
      "step": 310
    },
    {
      "epoch": 3.67816091954023,
      "grad_norm": 0.3618530333042145,
      "learning_rate": 0.00019026195391036338,
      "loss": 0.0493,
      "step": 320
    },
    {
      "epoch": 3.793103448275862,
      "grad_norm": 0.2655580937862396,
      "learning_rate": 0.00018942349838722486,
      "loss": 0.0502,
      "step": 330
    },
    {
      "epoch": 3.9080459770114944,
      "grad_norm": 0.30788642168045044,
      "learning_rate": 0.0001885524286891002,
      "loss": 0.0562,
      "step": 340
    },
    {
      "epoch": 4.022988505747127,
      "grad_norm": 0.3801023066043854,
      "learning_rate": 0.00018764906250899027,
      "loss": 0.0536,
      "step": 350
    },
    {
      "epoch": 4.137931034482759,
      "grad_norm": 0.34299996495246887,
      "learning_rate": 0.00018671372931893773,
      "loss": 0.0583,
      "step": 360
    },
    {
      "epoch": 4.252873563218391,
      "grad_norm": 0.5825142860412598,
      "learning_rate": 0.0001857467702498633,
      "loss": 0.0465,
      "step": 370
    },
    {
      "epoch": 4.3678160919540225,
      "grad_norm": 0.46258264780044556,
      "learning_rate": 0.0001847485379671496,
      "loss": 0.0469,
      "step": 380
    },
    {
      "epoch": 4.482758620689655,
      "grad_norm": 0.23550616204738617,
      "learning_rate": 0.0001837193965420188,
      "loss": 0.0477,
      "step": 390
    },
    {
      "epoch": 4.597701149425287,
      "grad_norm": 0.609255850315094,
      "learning_rate": 0.00018265972131874987,
      "loss": 0.0467,
      "step": 400
    },
    {
      "epoch": 4.712643678160919,
      "grad_norm": 0.3701482117176056,
      "learning_rate": 0.00018156989877778461,
      "loss": 0.0471,
      "step": 410
    },
    {
      "epoch": 4.827586206896552,
      "grad_norm": 0.4651474356651306,
      "learning_rate": 0.00018045032639477194,
      "loss": 0.0434,
      "step": 420
    },
    {
      "epoch": 4.942528735632184,
      "grad_norm": 0.34028705954551697,
      "learning_rate": 0.00017930141249560233,
      "loss": 0.0452,
      "step": 430
    },
    {
      "epoch": 5.057471264367816,
      "grad_norm": 0.2748933732509613,
      "learning_rate": 0.00017812357610748488,
      "loss": 0.0413,
      "step": 440
    },
    {
      "epoch": 5.172413793103448,
      "grad_norm": 0.4612014591693878,
      "learning_rate": 0.00017691724680612118,
      "loss": 0.0423,
      "step": 450
    },
    {
      "epoch": 5.287356321839081,
      "grad_norm": 0.37961891293525696,
      "learning_rate": 0.00017568286455903258,
      "loss": 0.0422,
      "step": 460
    },
    {
      "epoch": 5.402298850574713,
      "grad_norm": 0.3245999813079834,
      "learning_rate": 0.00017442087956509665,
      "loss": 0.0396,
      "step": 470
    },
    {
      "epoch": 5.517241379310345,
      "grad_norm": 0.5230941772460938,
      "learning_rate": 0.00017313175209035268,
      "loss": 0.0405,
      "step": 480
    },
    {
      "epoch": 5.6321839080459775,
      "grad_norm": 0.2870311737060547,
      "learning_rate": 0.00017181595230013525,
      "loss": 0.0343,
      "step": 490
    },
    {
      "epoch": 5.747126436781609,
      "grad_norm": 0.2876773774623871,
      "learning_rate": 0.00017047396008759754,
      "loss": 0.0436,
      "step": 500
    },
    {
      "epoch": 5.862068965517241,
      "grad_norm": 0.4095667898654938,
      "learning_rate": 0.00016910626489868649,
      "loss": 0.0408,
      "step": 510
    },
    {
      "epoch": 5.977011494252873,
      "grad_norm": 0.377605140209198,
      "learning_rate": 0.00016771336555363418,
      "loss": 0.0415,
      "step": 520
    },
    {
      "epoch": 6.091954022988506,
      "grad_norm": 0.28248798847198486,
      "learning_rate": 0.00016629577006503009,
      "loss": 0.0386,
      "step": 530
    },
    {
      "epoch": 6.206896551724138,
      "grad_norm": 0.36199840903282166,
      "learning_rate": 0.0001648539954525409,
      "loss": 0.0405,
      "step": 540
    },
    {
      "epoch": 6.32183908045977,
      "grad_norm": 0.2778664529323578,
      "learning_rate": 0.00016338856755434503,
      "loss": 0.0359,
      "step": 550
    },
    {
      "epoch": 6.436781609195402,
      "grad_norm": 0.23418012261390686,
      "learning_rate": 0.00016190002083535122,
      "loss": 0.0382,
      "step": 560
    },
    {
      "epoch": 6.551724137931035,
      "grad_norm": 0.3027312457561493,
      "learning_rate": 0.00016038889819227045,
      "loss": 0.0394,
      "step": 570
    },
    {
      "epoch": 6.666666666666667,
      "grad_norm": 0.2858007550239563,
      "learning_rate": 0.00015885575075561326,
      "loss": 0.042,
      "step": 580
    },
    {
      "epoch": 6.781609195402299,
      "grad_norm": 0.2762337923049927,
      "learning_rate": 0.00015730113768868312,
      "loss": 0.039,
      "step": 590
    },
    {
      "epoch": 6.896551724137931,
      "grad_norm": 0.40732237696647644,
      "learning_rate": 0.0001557256259836412,
      "loss": 0.0404,
      "step": 600
    },
    {
      "epoch": 7.011494252873563,
      "grad_norm": 0.36847805976867676,
      "learning_rate": 0.00015412979025471488,
      "loss": 0.0368,
      "step": 610
    },
    {
      "epoch": 7.126436781609195,
      "grad_norm": 0.2492237538099289,
      "learning_rate": 0.00015251421252862707,
      "loss": 0.0336,
      "step": 620
    },
    {
      "epoch": 7.241379310344827,
      "grad_norm": 0.2626156210899353,
      "learning_rate": 0.00015087948203232156,
      "loss": 0.0352,
      "step": 630
    },
    {
      "epoch": 7.35632183908046,
      "grad_norm": 0.6365396976470947,
      "learning_rate": 0.00014922619497806277,
      "loss": 0.0342,
      "step": 640
    },
    {
      "epoch": 7.471264367816092,
      "grad_norm": 0.3000635802745819,
      "learning_rate": 0.00014755495434598745,
      "loss": 0.037,
      "step": 650
    },
    {
      "epoch": 7.586206896551724,
      "grad_norm": 0.21869853138923645,
      "learning_rate": 0.0001458663696641884,
      "loss": 0.0365,
      "step": 660
    },
    {
      "epoch": 7.7011494252873565,
      "grad_norm": 0.22284150123596191,
      "learning_rate": 0.0001441610567864096,
      "loss": 0.035,
      "step": 670
    },
    {
      "epoch": 7.816091954022989,
      "grad_norm": 0.267621248960495,
      "learning_rate": 0.00014243963766743495,
      "loss": 0.029,
      "step": 680
    },
    {
      "epoch": 7.931034482758621,
      "grad_norm": 0.2817297875881195,
      "learning_rate": 0.00014070274013625096,
      "loss": 0.0303,
      "step": 690
    },
    {
      "epoch": 8.045977011494253,
      "grad_norm": 0.3712492287158966,
      "learning_rate": 0.00013895099766706784,
      "loss": 0.0297,
      "step": 700
    },
    {
      "epoch": 8.160919540229886,
      "grad_norm": 0.4549995958805084,
      "learning_rate": 0.00013718504914828135,
      "loss": 0.033,
      "step": 710
    },
    {
      "epoch": 8.275862068965518,
      "grad_norm": 0.28695234656333923,
      "learning_rate": 0.00013540553864945976,
      "loss": 0.0306,
      "step": 720
    },
    {
      "epoch": 8.39080459770115,
      "grad_norm": 0.34577062726020813,
      "learning_rate": 0.00013361311518644172,
      "loss": 0.0325,
      "step": 730
    },
    {
      "epoch": 8.505747126436782,
      "grad_norm": 0.3214464783668518,
      "learning_rate": 0.00013180843248462973,
      "loss": 0.0337,
      "step": 740
    },
    {
      "epoch": 8.620689655172415,
      "grad_norm": 0.33310961723327637,
      "learning_rate": 0.00012999214874056595,
      "loss": 0.0344,
      "step": 750
    },
    {
      "epoch": 8.735632183908045,
      "grad_norm": 0.25606226921081543,
      "learning_rate": 0.00012816492638187762,
      "loss": 0.0396,
      "step": 760
    },
    {
      "epoch": 8.850574712643677,
      "grad_norm": 0.36330148577690125,
      "learning_rate": 0.00012632743182567905,
      "loss": 0.0348,
      "step": 770
    },
    {
      "epoch": 8.96551724137931,
      "grad_norm": 0.314394474029541,
      "learning_rate": 0.00012448033523551865,
      "loss": 0.0399,
      "step": 780
    },
    {
      "epoch": 9.080459770114942,
      "grad_norm": 0.28129351139068604,
      "learning_rate": 0.00012262431027695964,
      "loss": 0.0298,
      "step": 790
    },
    {
      "epoch": 9.195402298850574,
      "grad_norm": 0.256881982088089,
      "learning_rate": 0.00012076003387188353,
      "loss": 0.0292,
      "step": 800
    },
    {
      "epoch": 9.310344827586206,
      "grad_norm": 0.1919921189546585,
      "learning_rate": 0.00011888818595160584,
      "loss": 0.0269,
      "step": 810
    },
    {
      "epoch": 9.425287356321839,
      "grad_norm": 0.2719796895980835,
      "learning_rate": 0.00011700944920889436,
      "loss": 0.0265,
      "step": 820
    },
    {
      "epoch": 9.540229885057471,
      "grad_norm": 0.2269754707813263,
      "learning_rate": 0.00011512450884898022,
      "loss": 0.0316,
      "step": 830
    },
    {
      "epoch": 9.655172413793103,
      "grad_norm": 0.23504453897476196,
      "learning_rate": 0.00011323405233965256,
      "loss": 0.0273,
      "step": 840
    },
    {
      "epoch": 9.770114942528735,
      "grad_norm": 0.22570957243442535,
      "learning_rate": 0.00011133876916052821,
      "loss": 0.0304,
      "step": 850
    },
    {
      "epoch": 9.885057471264368,
      "grad_norm": 0.19824576377868652,
      "learning_rate": 0.00010943935055158734,
      "loss": 0.0283,
      "step": 860
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.41852012276649475,
      "learning_rate": 0.00010753648926106723,
      "loss": 0.0319,
      "step": 870
    },
    {
      "epoch": 10.114942528735632,
      "grad_norm": 0.20548714697360992,
      "learning_rate": 0.00010563087929280613,
      "loss": 0.0285,
      "step": 880
    },
    {
      "epoch": 10.229885057471265,
      "grad_norm": 0.22767336666584015,
      "learning_rate": 0.00010372321565312872,
      "loss": 0.031,
      "step": 890
    },
    {
      "epoch": 10.344827586206897,
      "grad_norm": 0.20542040467262268,
      "learning_rate": 0.00010181419409736647,
      "loss": 0.0316,
      "step": 900
    },
    {
      "epoch": 10.459770114942529,
      "grad_norm": 0.3105849027633667,
      "learning_rate": 9.990451087610448e-05,
      "loss": 0.027,
      "step": 910
    },
    {
      "epoch": 10.574712643678161,
      "grad_norm": 0.31816890835762024,
      "learning_rate": 9.799486248124775e-05,
      "loss": 0.025,
      "step": 920
    },
    {
      "epoch": 10.689655172413794,
      "grad_norm": 0.3295416235923767,
      "learning_rate": 9.608594539199957e-05,
      "loss": 0.0247,
      "step": 930
    },
    {
      "epoch": 10.804597701149426,
      "grad_norm": 0.17071272432804108,
      "learning_rate": 9.417845582084448e-05,
      "loss": 0.0291,
      "step": 940
    },
    {
      "epoch": 10.919540229885058,
      "grad_norm": 0.189552441239357,
      "learning_rate": 9.227308945962827e-05,
      "loss": 0.0243,
      "step": 950
    },
    {
      "epoch": 11.03448275862069,
      "grad_norm": 0.3179641664028168,
      "learning_rate": 9.037054122582839e-05,
      "loss": 0.0308,
      "step": 960
    },
    {
      "epoch": 11.149425287356323,
      "grad_norm": 0.3051457703113556,
      "learning_rate": 8.847150500910618e-05,
      "loss": 0.0275,
      "step": 970
    },
    {
      "epoch": 11.264367816091955,
      "grad_norm": 0.29757606983184814,
      "learning_rate": 8.657667341823448e-05,
      "loss": 0.0264,
      "step": 980
    },
    {
      "epoch": 11.379310344827585,
      "grad_norm": 0.2845855951309204,
      "learning_rate": 8.4686737528492e-05,
      "loss": 0.0249,
      "step": 990
    },
    {
      "epoch": 11.494252873563218,
      "grad_norm": 0.239737406373024,
      "learning_rate": 8.280238662961728e-05,
      "loss": 0.027,
      "step": 1000
    },
    {
      "epoch": 11.60919540229885,
      "grad_norm": 0.2692360281944275,
      "learning_rate": 8.092430797441364e-05,
      "loss": 0.0216,
      "step": 1010
    },
    {
      "epoch": 11.724137931034482,
      "grad_norm": 0.18495500087738037,
      "learning_rate": 7.905318652809728e-05,
      "loss": 0.0255,
      "step": 1020
    },
    {
      "epoch": 11.839080459770114,
      "grad_norm": 0.2230875939130783,
      "learning_rate": 7.718970471847923e-05,
      "loss": 0.0262,
      "step": 1030
    },
    {
      "epoch": 11.954022988505747,
      "grad_norm": 0.14376775920391083,
      "learning_rate": 7.53345421870735e-05,
      "loss": 0.0209,
      "step": 1040
    },
    {
      "epoch": 12.068965517241379,
      "grad_norm": 0.20623371005058289,
      "learning_rate": 7.348837554122057e-05,
      "loss": 0.0192,
      "step": 1050
    },
    {
      "epoch": 12.183908045977011,
      "grad_norm": 0.27209600806236267,
      "learning_rate": 7.165187810731823e-05,
      "loss": 0.0208,
      "step": 1060
    },
    {
      "epoch": 12.298850574712644,
      "grad_norm": 0.19447851181030273,
      "learning_rate": 6.982571968524847e-05,
      "loss": 0.0201,
      "step": 1070
    },
    {
      "epoch": 12.413793103448276,
      "grad_norm": 0.18613241612911224,
      "learning_rate": 6.801056630409098e-05,
      "loss": 0.0248,
      "step": 1080
    },
    {
      "epoch": 12.528735632183908,
      "grad_norm": 0.24156583845615387,
      "learning_rate": 6.620707997921157e-05,
      "loss": 0.0197,
      "step": 1090
    },
    {
      "epoch": 12.64367816091954,
      "grad_norm": 0.16912145912647247,
      "learning_rate": 6.441591847081476e-05,
      "loss": 0.022,
      "step": 1100
    },
    {
      "epoch": 12.758620689655173,
      "grad_norm": 0.14165754616260529,
      "learning_rate": 6.263773504404801e-05,
      "loss": 0.0199,
      "step": 1110
    },
    {
      "epoch": 12.873563218390805,
      "grad_norm": 0.3424724340438843,
      "learning_rate": 6.087317823074565e-05,
      "loss": 0.0209,
      "step": 1120
    },
    {
      "epoch": 12.988505747126437,
      "grad_norm": 0.2658204138278961,
      "learning_rate": 5.912289159289883e-05,
      "loss": 0.0242,
      "step": 1130
    },
    {
      "epoch": 13.10344827586207,
      "grad_norm": 0.21321730315685272,
      "learning_rate": 5.7387513487938386e-05,
      "loss": 0.0216,
      "step": 1140
    },
    {
      "epoch": 13.218390804597702,
      "grad_norm": 0.2854823172092438,
      "learning_rate": 5.566767683591553e-05,
      "loss": 0.0227,
      "step": 1150
    },
    {
      "epoch": 13.333333333333334,
      "grad_norm": 0.28919658064842224,
      "learning_rate": 5.396400888866601e-05,
      "loss": 0.0195,
      "step": 1160
    },
    {
      "epoch": 13.448275862068966,
      "grad_norm": 0.22510255873203278,
      "learning_rate": 5.2277131001041125e-05,
      "loss": 0.0241,
      "step": 1170
    },
    {
      "epoch": 13.563218390804598,
      "grad_norm": 0.21545900404453278,
      "learning_rate": 5.060765840429e-05,
      "loss": 0.023,
      "step": 1180
    },
    {
      "epoch": 13.678160919540229,
      "grad_norm": 0.20618782937526703,
      "learning_rate": 4.8956199981674656e-05,
      "loss": 0.0181,
      "step": 1190
    },
    {
      "epoch": 13.793103448275861,
      "grad_norm": 0.22331970930099487,
      "learning_rate": 4.7323358046400844e-05,
      "loss": 0.0212,
      "step": 1200
    },
    {
      "epoch": 13.908045977011493,
      "grad_norm": 0.14791706204414368,
      "learning_rate": 4.570972812194457e-05,
      "loss": 0.0195,
      "step": 1210
    },
    {
      "epoch": 14.022988505747126,
      "grad_norm": 0.1526448130607605,
      "learning_rate": 4.4115898724855455e-05,
      "loss": 0.0188,
      "step": 1220
    },
    {
      "epoch": 14.137931034482758,
      "grad_norm": 0.18956783413887024,
      "learning_rate": 4.254245115011506e-05,
      "loss": 0.0188,
      "step": 1230
    },
    {
      "epoch": 14.25287356321839,
      "grad_norm": 0.1313301920890808,
      "learning_rate": 4.098995925912972e-05,
      "loss": 0.019,
      "step": 1240
    },
    {
      "epoch": 14.367816091954023,
      "grad_norm": 0.13764789700508118,
      "learning_rate": 3.945898927043372e-05,
      "loss": 0.0175,
      "step": 1250
    },
    {
      "epoch": 14.482758620689655,
      "grad_norm": 0.19556942582130432,
      "learning_rate": 3.7950099553180804e-05,
      "loss": 0.0196,
      "step": 1260
    },
    {
      "epoch": 14.597701149425287,
      "grad_norm": 0.14027345180511475,
      "learning_rate": 3.646384042349764e-05,
      "loss": 0.0177,
      "step": 1270
    },
    {
      "epoch": 14.71264367816092,
      "grad_norm": 0.2918284833431244,
      "learning_rate": 3.500075394377511e-05,
      "loss": 0.0204,
      "step": 1280
    },
    {
      "epoch": 14.827586206896552,
      "grad_norm": 0.12948164343833923,
      "learning_rate": 3.3561373724969224e-05,
      "loss": 0.0188,
      "step": 1290
    },
    {
      "epoch": 14.942528735632184,
      "grad_norm": 0.1773224174976349,
      "learning_rate": 3.214622473198492e-05,
      "loss": 0.0212,
      "step": 1300
    },
    {
      "epoch": 15.057471264367816,
      "grad_norm": 0.29863160848617554,
      "learning_rate": 3.075582309221289e-05,
      "loss": 0.0157,
      "step": 1310
    },
    {
      "epoch": 15.172413793103448,
      "grad_norm": 0.18764474987983704,
      "learning_rate": 2.939067590728972e-05,
      "loss": 0.0175,
      "step": 1320
    },
    {
      "epoch": 15.28735632183908,
      "grad_norm": 0.16273990273475647,
      "learning_rate": 2.8051281068149803e-05,
      "loss": 0.0135,
      "step": 1330
    },
    {
      "epoch": 15.402298850574713,
      "grad_norm": 0.25088945031166077,
      "learning_rate": 2.673812707343669e-05,
      "loss": 0.0242,
      "step": 1340
    },
    {
      "epoch": 15.517241379310345,
      "grad_norm": 0.25521960854530334,
      "learning_rate": 2.545169285133965e-05,
      "loss": 0.0188,
      "step": 1350
    },
    {
      "epoch": 15.632183908045977,
      "grad_norm": 0.15780223906040192,
      "learning_rate": 2.4192447584921195e-05,
      "loss": 0.0194,
      "step": 1360
    },
    {
      "epoch": 15.74712643678161,
      "grad_norm": 0.13658417761325836,
      "learning_rate": 2.296085054099828e-05,
      "loss": 0.0179,
      "step": 1370
    },
    {
      "epoch": 15.862068965517242,
      "grad_norm": 0.14593394100666046,
      "learning_rate": 2.175735090264058e-05,
      "loss": 0.016,
      "step": 1380
    },
    {
      "epoch": 15.977011494252874,
      "grad_norm": 0.20093883574008942,
      "learning_rate": 2.0582387605346088e-05,
      "loss": 0.0157,
      "step": 1390
    },
    {
      "epoch": 16.091954022988507,
      "grad_norm": 0.22261527180671692,
      "learning_rate": 1.943638917695453e-05,
      "loss": 0.0175,
      "step": 1400
    },
    {
      "epoch": 16.20689655172414,
      "grad_norm": 0.17486433684825897,
      "learning_rate": 1.831977358135625e-05,
      "loss": 0.0166,
      "step": 1410
    },
    {
      "epoch": 16.32183908045977,
      "grad_norm": 0.2138216346502304,
      "learning_rate": 1.723294806605428e-05,
      "loss": 0.0146,
      "step": 1420
    },
    {
      "epoch": 16.436781609195403,
      "grad_norm": 0.20112960040569305,
      "learning_rate": 1.6176309013634517e-05,
      "loss": 0.0159,
      "step": 1430
    },
    {
      "epoch": 16.551724137931036,
      "grad_norm": 0.15377485752105713,
      "learning_rate": 1.5150241797198883e-05,
      "loss": 0.016,
      "step": 1440
    },
    {
      "epoch": 16.666666666666668,
      "grad_norm": 0.23132722079753876,
      "learning_rate": 1.415512063981339e-05,
      "loss": 0.0134,
      "step": 1450
    },
    {
      "epoch": 16.7816091954023,
      "grad_norm": 0.15262600779533386,
      "learning_rate": 1.3191308478023212e-05,
      "loss": 0.017,
      "step": 1460
    },
    {
      "epoch": 16.896551724137932,
      "grad_norm": 0.0991855040192604,
      "learning_rate": 1.2259156829483842e-05,
      "loss": 0.0167,
      "step": 1470
    },
    {
      "epoch": 17.011494252873565,
      "grad_norm": 0.12278055399656296,
      "learning_rate": 1.1359005664756994e-05,
      "loss": 0.0146,
      "step": 1480
    },
    {
      "epoch": 17.126436781609197,
      "grad_norm": 0.17124158143997192,
      "learning_rate": 1.0491183283317997e-05,
      "loss": 0.0173,
      "step": 1490
    },
    {
      "epoch": 17.24137931034483,
      "grad_norm": 0.1300356686115265,
      "learning_rate": 9.656006193819633e-06,
      "loss": 0.0143,
      "step": 1500
    },
    {
      "epoch": 17.35632183908046,
      "grad_norm": 0.17946338653564453,
      "learning_rate": 8.853778998656537e-06,
      "loss": 0.0154,
      "step": 1510
    },
    {
      "epoch": 17.47126436781609,
      "grad_norm": 0.28736400604248047,
      "learning_rate": 8.084794282871689e-06,
      "loss": 0.0166,
      "step": 1520
    },
    {
      "epoch": 17.586206896551722,
      "grad_norm": 0.13112574815750122,
      "learning_rate": 7.3493325074461165e-06,
      "loss": 0.0132,
      "step": 1530
    },
    {
      "epoch": 17.701149425287355,
      "grad_norm": 0.12864838540554047,
      "learning_rate": 6.647661907010183e-06,
      "loss": 0.0171,
      "step": 1540
    },
    {
      "epoch": 17.816091954022987,
      "grad_norm": 0.16958807408809662,
      "learning_rate": 5.980038392014309e-06,
      "loss": 0.0161,
      "step": 1550
    },
    {
      "epoch": 17.93103448275862,
      "grad_norm": 0.36121729016304016,
      "learning_rate": 5.3467054553941405e-06,
      "loss": 0.0158,
      "step": 1560
    },
    {
      "epoch": 18.04597701149425,
      "grad_norm": 0.2107989490032196,
      "learning_rate": 4.7478940837649924e-06,
      "loss": 0.0147,
      "step": 1570
    },
    {
      "epoch": 18.160919540229884,
      "grad_norm": 0.15654149651527405,
      "learning_rate": 4.183822673177229e-06,
      "loss": 0.0164,
      "step": 1580
    },
    {
      "epoch": 18.275862068965516,
      "grad_norm": 0.1438828557729721,
      "learning_rate": 3.6546969494637986e-06,
      "loss": 0.0131,
      "step": 1590
    },
    {
      "epoch": 18.39080459770115,
      "grad_norm": 0.2543192207813263,
      "learning_rate": 3.16070989320868e-06,
      "loss": 0.0157,
      "step": 1600
    },
    {
      "epoch": 18.50574712643678,
      "grad_norm": 0.13453112542629242,
      "learning_rate": 2.702041669363875e-06,
      "loss": 0.017,
      "step": 1610
    },
    {
      "epoch": 18.620689655172413,
      "grad_norm": 0.16369780898094177,
      "learning_rate": 2.2788595615403474e-06,
      "loss": 0.0157,
      "step": 1620
    },
    {
      "epoch": 18.735632183908045,
      "grad_norm": 0.14639818668365479,
      "learning_rate": 1.8913179109969482e-06,
      "loss": 0.0122,
      "step": 1630
    },
    {
      "epoch": 18.850574712643677,
      "grad_norm": 0.23813354969024658,
      "learning_rate": 1.5395580603498328e-06,
      "loss": 0.0157,
      "step": 1640
    },
    {
      "epoch": 18.96551724137931,
      "grad_norm": 0.15577834844589233,
      "learning_rate": 1.2237083020224526e-06,
      "loss": 0.0144,
      "step": 1650
    },
    {
      "epoch": 19.080459770114942,
      "grad_norm": 0.06880059838294983,
      "learning_rate": 9.438838314553056e-07,
      "loss": 0.0109,
      "step": 1660
    },
    {
      "epoch": 19.195402298850574,
      "grad_norm": 0.19819270074367523,
      "learning_rate": 7.001867050923095e-07,
      "loss": 0.0134,
      "step": 1670
    },
    {
      "epoch": 19.310344827586206,
      "grad_norm": 0.10673543065786362,
      "learning_rate": 4.92705803159188e-07,
      "loss": 0.0155,
      "step": 1680
    },
    {
      "epoch": 19.42528735632184,
      "grad_norm": 0.16529639065265656,
      "learning_rate": 3.2151679724748975e-07,
      "loss": 0.0175,
      "step": 1690
    },
    {
      "epoch": 19.54022988505747,
      "grad_norm": 0.1206677109003067,
      "learning_rate": 1.8668212271585327e-07,
      "loss": 0.0188,
      "step": 1700
    },
    {
      "epoch": 19.655172413793103,
      "grad_norm": 0.10180158913135529,
      "learning_rate": 8.825095591891152e-08,
      "loss": 0.0158,
      "step": 1710
    },
    {
      "epoch": 19.770114942528735,
      "grad_norm": 0.14544282853603363,
      "learning_rate": 2.625919627188278e-08,
      "loss": 0.015,
      "step": 1720
    },
    {
      "epoch": 19.885057471264368,
      "grad_norm": 0.14054107666015625,
      "learning_rate": 7.294531574553176e-10,
      "loss": 0.0139,
      "step": 1730
    },
    {
      "epoch": 19.908045977011493,
      "step": 1732,
      "total_flos": 2.431760592612004e+17,
      "train_loss": 0.04785273781403314,
      "train_runtime": 1980.9382,
      "train_samples_per_second": 55.957,
      "train_steps_per_second": 0.874
    }
  ],
  "logging_steps": 10,
  "max_steps": 1732,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.431760592612004e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}