{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 19.654545454545456,
  "eval_steps": 500,
  "global_step": 1081,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 3.9825470447540283,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 1.293,
      "step": 10
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 1.8670121431350708,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.4981,
      "step": 20
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 1.4379267692565918,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.2739,
      "step": 30
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 0.9473206400871277,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.2078,
      "step": 40
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.6841241717338562,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.1781,
      "step": 50
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 1.1733323335647583,
      "learning_rate": 0.000199988280568259,
      "loss": 0.1661,
      "step": 60
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 0.7865139245986938,
      "learning_rate": 0.0001998945415950969,
      "loss": 0.1498,
      "step": 70
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 0.9522141218185425,
      "learning_rate": 0.00019970715152902254,
      "loss": 0.1335,
      "step": 80
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 0.7283738851547241,
      "learning_rate": 0.00019942628604814825,
      "loss": 0.1292,
      "step": 90
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.6728459596633911,
      "learning_rate": 0.00019905220846375032,
      "loss": 0.1078,
      "step": 100
    },
    {
      "epoch": 2.0,
      "grad_norm": 1.5674101114273071,
      "learning_rate": 0.00019858526947341497,
      "loss": 0.1197,
      "step": 110
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 0.42472681403160095,
      "learning_rate": 0.00019802590683225946,
      "loss": 0.1154,
      "step": 120
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 0.3803574740886688,
      "learning_rate": 0.0001973746449425368,
      "loss": 0.1068,
      "step": 130
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 0.685002863407135,
      "learning_rate": 0.00019663209436200887,
      "loss": 0.1054,
      "step": 140
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.8086279034614563,
      "learning_rate": 0.0001957989512315489,
      "loss": 0.101,
      "step": 150
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.3693373501300812,
      "learning_rate": 0.00019487599662250943,
      "loss": 0.0981,
      "step": 160
    },
    {
      "epoch": 3.090909090909091,
      "grad_norm": 0.5383622646331787,
      "learning_rate": 0.00019386409580446844,
      "loss": 0.0847,
      "step": 170
    },
    {
      "epoch": 3.2727272727272725,
      "grad_norm": 0.49988943338394165,
      "learning_rate": 0.00019276419743403933,
      "loss": 0.0979,
      "step": 180
    },
    {
      "epoch": 3.4545454545454546,
      "grad_norm": 0.4996686577796936,
      "learning_rate": 0.00019157733266550575,
      "loss": 0.0865,
      "step": 190
    },
    {
      "epoch": 3.6363636363636362,
      "grad_norm": 0.6447169780731201,
      "learning_rate": 0.00019030461418411497,
      "loss": 0.0822,
      "step": 200
    },
    {
      "epoch": 3.8181818181818183,
      "grad_norm": 0.4973439574241638,
      "learning_rate": 0.00018894723516293583,
      "loss": 0.0896,
      "step": 210
    },
    {
      "epoch": 4.0,
      "grad_norm": 1.349974513053894,
      "learning_rate": 0.00018750646814425938,
      "loss": 0.0844,
      "step": 220
    },
    {
      "epoch": 4.181818181818182,
      "grad_norm": 0.6695556640625,
      "learning_rate": 0.0001859836638465911,
      "loss": 0.0832,
      "step": 230
    },
    {
      "epoch": 4.363636363636363,
      "grad_norm": 0.5756475925445557,
      "learning_rate": 0.0001843802498983529,
      "loss": 0.0867,
      "step": 240
    },
    {
      "epoch": 4.545454545454545,
      "grad_norm": 0.334103524684906,
      "learning_rate": 0.00018269772949948182,
      "loss": 0.0794,
      "step": 250
    },
    {
      "epoch": 4.7272727272727275,
      "grad_norm": 0.7557447552680969,
      "learning_rate": 0.00018093768001218094,
      "loss": 0.085,
      "step": 260
    },
    {
      "epoch": 4.909090909090909,
      "grad_norm": 0.37030595541000366,
      "learning_rate": 0.00017910175148214274,
      "loss": 0.079,
      "step": 270
    },
    {
      "epoch": 5.090909090909091,
      "grad_norm": 0.7280672192573547,
      "learning_rate": 0.0001771916650916321,
      "loss": 0.0738,
      "step": 280
    },
    {
      "epoch": 5.2727272727272725,
      "grad_norm": 0.6517002582550049,
      "learning_rate": 0.00017520921154587843,
      "loss": 0.0718,
      "step": 290
    },
    {
      "epoch": 5.454545454545454,
      "grad_norm": 0.40652868151664734,
      "learning_rate": 0.00017315624939429037,
      "loss": 0.072,
      "step": 300
    },
    {
      "epoch": 5.636363636363637,
      "grad_norm": 0.5963271260261536,
      "learning_rate": 0.0001710347032880664,
      "loss": 0.0786,
      "step": 310
    },
    {
      "epoch": 5.818181818181818,
      "grad_norm": 0.356381356716156,
      "learning_rate": 0.00016884656217583518,
      "loss": 0.0578,
      "step": 320
    },
    {
      "epoch": 6.0,
      "grad_norm": 1.6636348962783813,
      "learning_rate": 0.00016659387743901685,
      "loss": 0.0893,
      "step": 330
    },
    {
      "epoch": 6.181818181818182,
      "grad_norm": 0.42870157957077026,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.0714,
      "step": 340
    },
    {
      "epoch": 6.363636363636363,
      "grad_norm": 0.32923629879951477,
      "learning_rate": 0.00016190338318551427,
      "loss": 0.0695,
      "step": 350
    },
    {
      "epoch": 6.545454545454545,
      "grad_norm": 0.491409569978714,
      "learning_rate": 0.0001594699710053223,
      "loss": 0.0797,
      "step": 360
    },
    {
      "epoch": 6.7272727272727275,
      "grad_norm": 0.3972729742527008,
      "learning_rate": 0.00015698080575102661,
      "loss": 0.0697,
      "step": 370
    },
    {
      "epoch": 6.909090909090909,
      "grad_norm": 0.4414363205432892,
      "learning_rate": 0.00015443822101406064,
      "loss": 0.0591,
      "step": 380
    },
    {
      "epoch": 7.090909090909091,
      "grad_norm": 0.3927883207798004,
      "learning_rate": 0.00015184460046660137,
      "loss": 0.0665,
      "step": 390
    },
    {
      "epoch": 7.2727272727272725,
      "grad_norm": 0.39698049426078796,
      "learning_rate": 0.00014920237562687785,
      "loss": 0.0626,
      "step": 400
    },
    {
      "epoch": 7.454545454545454,
      "grad_norm": 0.7567592263221741,
      "learning_rate": 0.00014651402357962367,
      "loss": 0.0604,
      "step": 410
    },
    {
      "epoch": 7.636363636363637,
      "grad_norm": 0.5023426413536072,
      "learning_rate": 0.0001437820646538112,
      "loss": 0.0613,
      "step": 420
    },
    {
      "epoch": 7.818181818181818,
      "grad_norm": 0.38268569111824036,
      "learning_rate": 0.00014100906005984403,
      "loss": 0.0607,
      "step": 430
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.9016662836074829,
      "learning_rate": 0.0001381976094884232,
      "loss": 0.0592,
      "step": 440
    },
    {
      "epoch": 8.181818181818182,
      "grad_norm": 0.5547723770141602,
      "learning_rate": 0.00013535034867333837,
      "loss": 0.0586,
      "step": 450
    },
    {
      "epoch": 8.363636363636363,
      "grad_norm": 0.44761940836906433,
      "learning_rate": 0.00013246994692046836,
      "loss": 0.0579,
      "step": 460
    },
    {
      "epoch": 8.545454545454545,
      "grad_norm": 0.24242262542247772,
      "learning_rate": 0.00012955910460530788,
      "loss": 0.0539,
      "step": 470
    },
    {
      "epoch": 8.727272727272727,
      "grad_norm": 0.5021650195121765,
      "learning_rate": 0.00012662055064136668,
      "loss": 0.0564,
      "step": 480
    },
    {
      "epoch": 8.909090909090908,
      "grad_norm": 0.37628263235092163,
      "learning_rate": 0.00012365703992181425,
      "loss": 0.0522,
      "step": 490
    },
    {
      "epoch": 9.090909090909092,
      "grad_norm": 0.3592880964279175,
      "learning_rate": 0.0001206713507367684,
      "loss": 0.0539,
      "step": 500
    },
    {
      "epoch": 9.272727272727273,
      "grad_norm": 0.3467582166194916,
      "learning_rate": 0.0001176662821686496,
      "loss": 0.0575,
      "step": 510
    },
    {
      "epoch": 9.454545454545455,
      "grad_norm": 0.3472346067428589,
      "learning_rate": 0.00011464465146804217,
      "loss": 0.0602,
      "step": 520
    },
    {
      "epoch": 9.636363636363637,
      "grad_norm": 0.2870403826236725,
      "learning_rate": 0.00011160929141252303,
      "loss": 0.0524,
      "step": 530
    },
    {
      "epoch": 9.818181818181818,
      "grad_norm": 0.2755221128463745,
      "learning_rate": 0.0001085630476509339,
      "loss": 0.0536,
      "step": 540
    },
    {
      "epoch": 10.0,
      "grad_norm": 1.0921339988708496,
      "learning_rate": 0.00010550877603558655,
      "loss": 0.05,
      "step": 550
    },
    {
      "epoch": 10.181818181818182,
      "grad_norm": 0.2905895411968231,
      "learning_rate": 0.00010244933994490249,
      "loss": 0.0469,
      "step": 560
    },
    {
      "epoch": 10.363636363636363,
      "grad_norm": 0.4015822112560272,
      "learning_rate": 9.938760759899674e-05,
      "loss": 0.0519,
      "step": 570
    },
    {
      "epoch": 10.545454545454545,
      "grad_norm": 0.26584434509277344,
      "learning_rate": 9.632644937072277e-05,
      "loss": 0.0527,
      "step": 580
    },
    {
      "epoch": 10.727272727272727,
      "grad_norm": 0.29837700724601746,
      "learning_rate": 9.326873509469887e-05,
      "loss": 0.0506,
      "step": 590
    },
    {
      "epoch": 10.909090909090908,
      "grad_norm": 0.3558896780014038,
      "learning_rate": 9.021733137683962e-05,
      "loss": 0.0441,
      "step": 600
    },
    {
      "epoch": 11.090909090909092,
      "grad_norm": 0.2896372377872467,
      "learning_rate": 8.717509890691368e-05,
      "loss": 0.0428,
      "step": 610
    },
    {
      "epoch": 11.272727272727273,
      "grad_norm": 0.5100614428520203,
      "learning_rate": 8.414488977664859e-05,
      "loss": 0.0421,
      "step": 620
    },
    {
      "epoch": 11.454545454545455,
      "grad_norm": 0.3731076121330261,
      "learning_rate": 8.112954480589558e-05,
      "loss": 0.0431,
      "step": 630
    },
    {
      "epoch": 11.636363636363637,
      "grad_norm": 0.2851674258708954,
      "learning_rate": 7.813189087936243e-05,
      "loss": 0.0431,
      "step": 640
    },
    {
      "epoch": 11.818181818181818,
      "grad_norm": 0.3725188672542572,
      "learning_rate": 7.515473829640987e-05,
      "loss": 0.0427,
      "step": 650
    },
    {
      "epoch": 12.0,
      "grad_norm": 1.0623624324798584,
      "learning_rate": 7.220087813639736e-05,
      "loss": 0.0485,
      "step": 660
    },
    {
      "epoch": 12.181818181818182,
      "grad_norm": 0.3354561924934387,
      "learning_rate": 6.927307964204694e-05,
      "loss": 0.042,
      "step": 670
    },
    {
      "epoch": 12.363636363636363,
      "grad_norm": 0.2896369695663452,
      "learning_rate": 6.637408762327972e-05,
      "loss": 0.0398,
      "step": 680
    },
    {
      "epoch": 12.545454545454545,
      "grad_norm": 0.24495282769203186,
      "learning_rate": 6.350661988395723e-05,
      "loss": 0.0366,
      "step": 690
    },
    {
      "epoch": 12.727272727272727,
      "grad_norm": 0.3516719937324524,
      "learning_rate": 6.067336467394169e-05,
      "loss": 0.0355,
      "step": 700
    },
    {
      "epoch": 12.909090909090908,
      "grad_norm": 0.22353091835975647,
      "learning_rate": 5.787697816886273e-05,
      "loss": 0.0365,
      "step": 710
    },
    {
      "epoch": 13.090909090909092,
      "grad_norm": 0.3955240249633789,
      "learning_rate": 5.5120081979953785e-05,
      "loss": 0.033,
      "step": 720
    },
    {
      "epoch": 13.272727272727273,
      "grad_norm": 0.35612088441848755,
      "learning_rate": 5.240526069629265e-05,
      "loss": 0.0448,
      "step": 730
    },
    {
      "epoch": 13.454545454545455,
      "grad_norm": 0.3462859094142914,
      "learning_rate": 4.97350594617502e-05,
      "loss": 0.0418,
      "step": 740
    },
    {
      "epoch": 13.636363636363637,
      "grad_norm": 0.29914817214012146,
      "learning_rate": 4.7111981588919084e-05,
      "loss": 0.0412,
      "step": 750
    },
    {
      "epoch": 13.818181818181818,
      "grad_norm": 0.3142814338207245,
      "learning_rate": 4.453848621225912e-05,
      "loss": 0.0368,
      "step": 760
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.4887177348136902,
      "learning_rate": 4.201698598265973e-05,
      "loss": 0.0387,
      "step": 770
    },
    {
      "epoch": 14.181818181818182,
      "grad_norm": 0.19729413092136383,
      "learning_rate": 3.9549844805580706e-05,
      "loss": 0.0362,
      "step": 780
    },
    {
      "epoch": 14.363636363636363,
      "grad_norm": 0.3818693161010742,
      "learning_rate": 3.713937562489179e-05,
      "loss": 0.0425,
      "step": 790
    },
    {
      "epoch": 14.545454545454545,
      "grad_norm": 0.21857142448425293,
      "learning_rate": 3.4787838254488694e-05,
      "loss": 0.035,
      "step": 800
    },
    {
      "epoch": 14.727272727272727,
      "grad_norm": 0.26334837079048157,
      "learning_rate": 3.249743725971849e-05,
      "loss": 0.0332,
      "step": 810
    },
    {
      "epoch": 14.909090909090908,
      "grad_norm": 0.13971690833568573,
      "learning_rate": 3.0270319890600462e-05,
      "loss": 0.0362,
      "step": 820
    },
    {
      "epoch": 15.090909090909092,
      "grad_norm": 0.3234981298446655,
      "learning_rate": 2.810857406878009e-05,
      "loss": 0.0288,
      "step": 830
    },
    {
      "epoch": 15.272727272727273,
      "grad_norm": 0.29610171914100647,
      "learning_rate": 2.601422643010335e-05,
      "loss": 0.0363,
      "step": 840
    },
    {
      "epoch": 15.454545454545455,
      "grad_norm": 1.4063012599945068,
      "learning_rate": 2.3989240424646355e-05,
      "loss": 0.0322,
      "step": 850
    },
    {
      "epoch": 15.636363636363637,
      "grad_norm": 0.261489599943161,
      "learning_rate": 2.2035514475981756e-05,
      "loss": 0.037,
      "step": 860
    },
    {
      "epoch": 15.818181818181818,
      "grad_norm": 0.21999020874500275,
      "learning_rate": 2.0154880201407367e-05,
      "loss": 0.0319,
      "step": 870
    },
    {
      "epoch": 16.0,
      "grad_norm": 2.1982412338256836,
      "learning_rate": 1.834910069480571e-05,
      "loss": 0.0571,
      "step": 880
    },
    {
      "epoch": 16.181818181818183,
      "grad_norm": 0.295622855424881,
      "learning_rate": 1.6619868873744147e-05,
      "loss": 0.0313,
      "step": 890
    },
    {
      "epoch": 16.363636363636363,
      "grad_norm": 0.266347736120224,
      "learning_rate": 1.49688058923654e-05,
      "loss": 0.034,
      "step": 900
    },
    {
      "epoch": 16.545454545454547,
      "grad_norm": 0.3452344238758087,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.0391,
      "step": 910
    },
    {
      "epoch": 16.727272727272727,
      "grad_norm": 0.21851502358913422,
      "learning_rate": 1.1907303197818665e-05,
      "loss": 0.0307,
      "step": 920
    },
    {
      "epoch": 16.90909090909091,
      "grad_norm": 0.4546089470386505,
      "learning_rate": 1.0499733642206033e-05,
      "loss": 0.0273,
      "step": 930
    },
    {
      "epoch": 17.09090909090909,
      "grad_norm": 0.22212770581245422,
      "learning_rate": 9.176070550615378e-06,
      "loss": 0.0305,
      "step": 940
    },
    {
      "epoch": 17.272727272727273,
      "grad_norm": 0.24768735468387604,
      "learning_rate": 7.937554856667196e-06,
      "loss": 0.0333,
      "step": 950
    },
    {
      "epoch": 17.454545454545453,
      "grad_norm": 0.19499576091766357,
      "learning_rate": 6.785347668330777e-06,
      "loss": 0.0263,
      "step": 960
    },
    {
      "epoch": 17.636363636363637,
      "grad_norm": 0.16682682931423187,
      "learning_rate": 5.720529179385659e-06,
      "loss": 0.0381,
      "step": 970
    },
    {
      "epoch": 17.818181818181817,
      "grad_norm": 0.223773792386055,
      "learning_rate": 4.744097656740709e-06,
      "loss": 0.0312,
      "step": 980
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.8798514604568481,
      "learning_rate": 3.856968504558989e-06,
      "loss": 0.0277,
      "step": 990
    },
    {
      "epoch": 18.181818181818183,
      "grad_norm": 0.18742257356643677,
      "learning_rate": 3.059973406066963e-06,
      "loss": 0.0288,
      "step": 1000
    },
    {
      "epoch": 18.363636363636363,
      "grad_norm": 0.19658038020133972,
      "learning_rate": 2.353859543851644e-06,
      "loss": 0.0378,
      "step": 1010
    },
    {
      "epoch": 18.545454545454547,
      "grad_norm": 0.18045732378959656,
      "learning_rate": 1.7392888993773005e-06,
      "loss": 0.0247,
      "step": 1020
    },
    {
      "epoch": 18.727272727272727,
      "grad_norm": 0.28070148825645447,
      "learning_rate": 1.216837632378065e-06,
      "loss": 0.0314,
      "step": 1030
    },
    {
      "epoch": 18.90909090909091,
      "grad_norm": 0.20971472561359406,
      "learning_rate": 7.86995540708424e-07,
      "loss": 0.0278,
      "step": 1040
    },
    {
      "epoch": 19.09090909090909,
      "grad_norm": 0.1779453009366989,
      "learning_rate": 4.501656011579036e-07,
      "loss": 0.0445,
      "step": 1050
    },
    {
      "epoch": 19.272727272727273,
      "grad_norm": 0.2201903909444809,
      "learning_rate": 2.066635916605386e-07,
      "loss": 0.0247,
      "step": 1060
    },
    {
      "epoch": 19.454545454545453,
      "grad_norm": 0.17639388144016266,
      "learning_rate": 5.6717795253113935e-08,
      "loss": 0.025,
      "step": 1070
    },
    {
      "epoch": 19.636363636363637,
      "grad_norm": 0.2432723492383957,
      "learning_rate": 4.687860599927873e-10,
      "loss": 0.0306,
      "step": 1080
    },
    {
      "epoch": 19.654545454545456,
      "step": 1081,
      "total_flos": 1.40579546183604e+17,
      "train_loss": 0.0782834448637122,
      "train_runtime": 1256.8447,
      "train_samples_per_second": 55.046,
      "train_steps_per_second": 0.86
    }
  ],
  "logging_steps": 10,
  "max_steps": 1081,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.40579546183604e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}