{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.016675351745700884,
  "eval_steps": 50,
  "global_step": 200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 8.337675872850443e-05,
      "grad_norm": 2.534695863723755,
      "learning_rate": 2e-05,
      "loss": 6.996,
      "step": 1
    },
    {
      "epoch": 8.337675872850443e-05,
      "eval_loss": 2.2557170391082764,
      "eval_runtime": 69.6462,
      "eval_samples_per_second": 72.509,
      "eval_steps_per_second": 36.255,
      "step": 1
    },
    {
      "epoch": 0.00016675351745700886,
      "grad_norm": 4.112269878387451,
      "learning_rate": 4e-05,
      "loss": 10.7356,
      "step": 2
    },
    {
      "epoch": 0.0002501302761855133,
      "grad_norm": 4.108906269073486,
      "learning_rate": 6e-05,
      "loss": 9.3082,
      "step": 3
    },
    {
      "epoch": 0.0003335070349140177,
      "grad_norm": 3.1692538261413574,
      "learning_rate": 8e-05,
      "loss": 9.1058,
      "step": 4
    },
    {
      "epoch": 0.00041688379364252214,
      "grad_norm": 5.433767318725586,
      "learning_rate": 0.0001,
      "loss": 13.4658,
      "step": 5
    },
    {
      "epoch": 0.0005002605523710266,
      "grad_norm": 3.1914710998535156,
      "learning_rate": 0.00012,
      "loss": 8.0176,
      "step": 6
    },
    {
      "epoch": 0.000583637311099531,
      "grad_norm": 3.6264748573303223,
      "learning_rate": 0.00014,
      "loss": 9.5218,
      "step": 7
    },
    {
      "epoch": 0.0006670140698280355,
      "grad_norm": 3.173428773880005,
      "learning_rate": 0.00016,
      "loss": 8.7686,
      "step": 8
    },
    {
      "epoch": 0.0007503908285565399,
      "grad_norm": 3.8662312030792236,
      "learning_rate": 0.00018,
      "loss": 9.6705,
      "step": 9
    },
    {
      "epoch": 0.0008337675872850443,
      "grad_norm": 2.918992280960083,
      "learning_rate": 0.0002,
      "loss": 7.5198,
      "step": 10
    },
    {
      "epoch": 0.0009171443460135487,
      "grad_norm": 2.782639741897583,
      "learning_rate": 0.0001999863304992469,
      "loss": 8.8055,
      "step": 11
    },
    {
      "epoch": 0.0010005211047420532,
      "grad_norm": 2.9841339588165283,
      "learning_rate": 0.00019994532573409262,
      "loss": 8.0065,
      "step": 12
    },
    {
      "epoch": 0.0010838978634705575,
      "grad_norm": 4.018250942230225,
      "learning_rate": 0.00019987699691483048,
      "loss": 8.9573,
      "step": 13
    },
    {
      "epoch": 0.001167274622199062,
      "grad_norm": 3.4327125549316406,
      "learning_rate": 0.00019978136272187747,
      "loss": 9.973,
      "step": 14
    },
    {
      "epoch": 0.0012506513809275664,
      "grad_norm": 3.299730062484741,
      "learning_rate": 0.000199658449300667,
      "loss": 7.8683,
      "step": 15
    },
    {
      "epoch": 0.001334028139656071,
      "grad_norm": 3.753506898880005,
      "learning_rate": 0.00019950829025450114,
      "loss": 7.2567,
      "step": 16
    },
    {
      "epoch": 0.0014174048983845752,
      "grad_norm": 4.432242393493652,
      "learning_rate": 0.00019933092663536382,
      "loss": 9.6064,
      "step": 17
    },
    {
      "epoch": 0.0015007816571130797,
      "grad_norm": 6.0634684562683105,
      "learning_rate": 0.00019912640693269752,
      "loss": 10.0913,
      "step": 18
    },
    {
      "epoch": 0.0015841584158415843,
      "grad_norm": 5.204840660095215,
      "learning_rate": 0.00019889478706014687,
      "loss": 8.775,
      "step": 19
    },
    {
      "epoch": 0.0016675351745700886,
      "grad_norm": 5.777037620544434,
      "learning_rate": 0.00019863613034027224,
      "loss": 9.278,
      "step": 20
    },
    {
      "epoch": 0.001750911933298593,
      "grad_norm": 4.261403560638428,
      "learning_rate": 0.00019835050748723824,
      "loss": 9.6213,
      "step": 21
    },
    {
      "epoch": 0.0018342886920270974,
      "grad_norm": 4.248034954071045,
      "learning_rate": 0.00019803799658748094,
      "loss": 9.3568,
      "step": 22
    },
    {
      "epoch": 0.001917665450755602,
      "grad_norm": 3.803323268890381,
      "learning_rate": 0.00019769868307835994,
      "loss": 8.48,
      "step": 23
    },
    {
      "epoch": 0.0020010422094841065,
      "grad_norm": 3.443775177001953,
      "learning_rate": 0.0001973326597248006,
      "loss": 8.5166,
      "step": 24
    },
    {
      "epoch": 0.0020844189682126106,
      "grad_norm": 3.10636568069458,
      "learning_rate": 0.00019694002659393305,
      "loss": 6.9142,
      "step": 25
    },
    {
      "epoch": 0.002167795726941115,
      "grad_norm": 7.100295543670654,
      "learning_rate": 0.00019652089102773488,
      "loss": 11.6384,
      "step": 26
    },
    {
      "epoch": 0.0022511724856696196,
      "grad_norm": 5.785402774810791,
      "learning_rate": 0.00019607536761368484,
      "loss": 10.7454,
      "step": 27
    },
    {
      "epoch": 0.002334549244398124,
      "grad_norm": 3.018549680709839,
      "learning_rate": 0.00019560357815343577,
      "loss": 7.6228,
      "step": 28
    },
    {
      "epoch": 0.0024179260031266287,
      "grad_norm": 4.010953426361084,
      "learning_rate": 0.00019510565162951537,
      "loss": 9.4401,
      "step": 29
    },
    {
      "epoch": 0.0025013027618551328,
      "grad_norm": 4.524446964263916,
      "learning_rate": 0.00019458172417006347,
      "loss": 10.1375,
      "step": 30
    },
    {
      "epoch": 0.0025846795205836373,
      "grad_norm": 3.461047887802124,
      "learning_rate": 0.00019403193901161613,
      "loss": 8.033,
      "step": 31
    },
    {
      "epoch": 0.002668056279312142,
      "grad_norm": 3.3522675037384033,
      "learning_rate": 0.0001934564464599461,
      "loss": 8.2424,
      "step": 32
    },
    {
      "epoch": 0.0027514330380406463,
      "grad_norm": 2.893357515335083,
      "learning_rate": 0.00019285540384897073,
      "loss": 7.1057,
      "step": 33
    },
    {
      "epoch": 0.0028348097967691504,
      "grad_norm": 3.0926976203918457,
      "learning_rate": 0.00019222897549773848,
      "loss": 8.0821,
      "step": 34
    },
    {
      "epoch": 0.002918186555497655,
      "grad_norm": 3.8426332473754883,
      "learning_rate": 0.00019157733266550575,
      "loss": 10.5508,
      "step": 35
    },
    {
      "epoch": 0.0030015633142261595,
      "grad_norm": 4.10872220993042,
      "learning_rate": 0.00019090065350491626,
      "loss": 10.314,
      "step": 36
    },
    {
      "epoch": 0.003084940072954664,
      "grad_norm": 2.7273058891296387,
      "learning_rate": 0.00019019912301329592,
      "loss": 8.1275,
      "step": 37
    },
    {
      "epoch": 0.0031683168316831685,
      "grad_norm": 3.5605056285858154,
      "learning_rate": 0.00018947293298207635,
      "loss": 8.6723,
      "step": 38
    },
    {
      "epoch": 0.0032516935904116726,
      "grad_norm": 5.416788578033447,
      "learning_rate": 0.0001887222819443612,
      "loss": 9.7495,
      "step": 39
    },
    {
      "epoch": 0.003335070349140177,
      "grad_norm": 3.8840010166168213,
      "learning_rate": 0.0001879473751206489,
      "loss": 8.8682,
      "step": 40
    },
    {
      "epoch": 0.0034184471078686817,
      "grad_norm": 3.141036033630371,
      "learning_rate": 0.00018714842436272773,
      "loss": 7.8402,
      "step": 41
    },
    {
      "epoch": 0.003501823866597186,
      "grad_norm": 3.415287971496582,
      "learning_rate": 0.00018632564809575742,
      "loss": 7.4874,
      "step": 42
    },
    {
      "epoch": 0.0035852006253256903,
      "grad_norm": 6.077539443969727,
      "learning_rate": 0.0001854792712585539,
      "loss": 10.3711,
      "step": 43
    },
    {
      "epoch": 0.003668577384054195,
      "grad_norm": 4.345938682556152,
      "learning_rate": 0.00018460952524209355,
      "loss": 10.1496,
      "step": 44
    },
    {
      "epoch": 0.0037519541427826993,
      "grad_norm": 4.29965353012085,
      "learning_rate": 0.00018371664782625287,
      "loss": 9.2361,
      "step": 45
    },
    {
      "epoch": 0.003835330901511204,
      "grad_norm": 4.321191787719727,
      "learning_rate": 0.00018280088311480201,
      "loss": 8.8282,
      "step": 46
    },
    {
      "epoch": 0.003918707660239708,
      "grad_norm": 3.4017069339752197,
      "learning_rate": 0.00018186248146866927,
      "loss": 8.8426,
      "step": 47
    },
    {
      "epoch": 0.004002084418968213,
      "grad_norm": 3.803649425506592,
      "learning_rate": 0.00018090169943749476,
      "loss": 7.1382,
      "step": 48
    },
    {
      "epoch": 0.004085461177696717,
      "grad_norm": 5.015726566314697,
      "learning_rate": 0.0001799187996894925,
      "loss": 9.7273,
      "step": 49
    },
    {
      "epoch": 0.004168837936425221,
      "grad_norm": 5.0045037269592285,
      "learning_rate": 0.00017891405093963938,
      "loss": 10.0517,
      "step": 50
    },
    {
      "epoch": 0.004168837936425221,
      "eval_loss": 2.041832447052002,
      "eval_runtime": 69.7121,
      "eval_samples_per_second": 72.441,
      "eval_steps_per_second": 36.22,
      "step": 50
    },
    {
      "epoch": 0.004252214695153726,
      "grad_norm": 5.273454666137695,
      "learning_rate": 0.00017788772787621126,
      "loss": 7.7355,
      "step": 51
    },
    {
      "epoch": 0.00433559145388223,
      "grad_norm": 4.412261962890625,
      "learning_rate": 0.00017684011108568592,
      "loss": 6.7413,
      "step": 52
    },
    {
      "epoch": 0.004418968212610735,
      "grad_norm": 3.984342098236084,
      "learning_rate": 0.0001757714869760335,
      "loss": 8.8219,
      "step": 53
    },
    {
      "epoch": 0.004502344971339239,
      "grad_norm": 3.3683133125305176,
      "learning_rate": 0.0001746821476984154,
      "loss": 7.4397,
      "step": 54
    },
    {
      "epoch": 0.004585721730067743,
      "grad_norm": 4.496111869812012,
      "learning_rate": 0.00017357239106731317,
      "loss": 6.8065,
      "step": 55
    },
    {
      "epoch": 0.004669098488796248,
      "grad_norm": 4.162015438079834,
      "learning_rate": 0.00017244252047910892,
      "loss": 8.1048,
      "step": 56
    },
    {
      "epoch": 0.004752475247524752,
      "grad_norm": 3.4676620960235596,
      "learning_rate": 0.00017129284482913972,
      "loss": 6.8288,
      "step": 57
    },
    {
      "epoch": 0.004835852006253257,
      "grad_norm": 4.051041603088379,
      "learning_rate": 0.00017012367842724887,
      "loss": 7.3718,
      "step": 58
    },
    {
      "epoch": 0.004919228764981761,
      "grad_norm": 5.180737018585205,
      "learning_rate": 0.0001689353409118566,
      "loss": 9.8836,
      "step": 59
    },
    {
      "epoch": 0.0050026055237102655,
      "grad_norm": 5.89562463760376,
      "learning_rate": 0.00016772815716257412,
      "loss": 8.8932,
      "step": 60
    },
    {
      "epoch": 0.0050859822824387705,
      "grad_norm": 4.652097702026367,
      "learning_rate": 0.0001665024572113848,
      "loss": 7.3464,
      "step": 61
    },
    {
      "epoch": 0.0051693590411672746,
      "grad_norm": 3.2940993309020996,
      "learning_rate": 0.00016525857615241687,
      "loss": 6.9377,
      "step": 62
    },
    {
      "epoch": 0.005252735799895779,
      "grad_norm": 4.341916561126709,
      "learning_rate": 0.00016399685405033167,
      "loss": 9.9174,
      "step": 63
    },
    {
      "epoch": 0.005336112558624284,
      "grad_norm": 6.677371501922607,
      "learning_rate": 0.0001627176358473537,
      "loss": 10.8274,
      "step": 64
    },
    {
      "epoch": 0.005419489317352788,
      "grad_norm": 3.8812875747680664,
      "learning_rate": 0.0001614212712689668,
      "loss": 7.6091,
      "step": 65
    },
    {
      "epoch": 0.005502866076081293,
      "grad_norm": 5.111429691314697,
      "learning_rate": 0.00016010811472830252,
      "loss": 6.8907,
      "step": 66
    },
    {
      "epoch": 0.005586242834809797,
      "grad_norm": 3.911635398864746,
      "learning_rate": 0.00015877852522924732,
      "loss": 7.5746,
      "step": 67
    },
    {
      "epoch": 0.005669619593538301,
      "grad_norm": 4.402072906494141,
      "learning_rate": 0.00015743286626829437,
      "loss": 7.3162,
      "step": 68
    },
    {
      "epoch": 0.005752996352266806,
      "grad_norm": 4.469682216644287,
      "learning_rate": 0.0001560715057351673,
      "loss": 8.1245,
      "step": 69
    },
    {
      "epoch": 0.00583637311099531,
      "grad_norm": 5.504849910736084,
      "learning_rate": 0.00015469481581224272,
      "loss": 10.3321,
      "step": 70
    },
    {
      "epoch": 0.005919749869723815,
      "grad_norm": 5.990438938140869,
      "learning_rate": 0.0001533031728727994,
      "loss": 8.4619,
      "step": 71
    },
    {
      "epoch": 0.006003126628452319,
      "grad_norm": 3.5529396533966064,
      "learning_rate": 0.00015189695737812152,
      "loss": 7.236,
      "step": 72
    },
    {
      "epoch": 0.006086503387180823,
      "grad_norm": 5.105240345001221,
      "learning_rate": 0.0001504765537734844,
      "loss": 11.3329,
      "step": 73
    },
    {
      "epoch": 0.006169880145909328,
      "grad_norm": 3.823227882385254,
      "learning_rate": 0.00014904235038305083,
      "loss": 8.5633,
      "step": 74
    },
    {
      "epoch": 0.006253256904637832,
      "grad_norm": 4.1277971267700195,
      "learning_rate": 0.00014759473930370736,
      "loss": 8.9986,
      "step": 75
    },
    {
      "epoch": 0.006336633663366337,
      "grad_norm": 5.1016740798950195,
      "learning_rate": 0.0001461341162978688,
      "loss": 6.7809,
      "step": 76
    },
    {
      "epoch": 0.006420010422094841,
      "grad_norm": 5.213894367218018,
      "learning_rate": 0.00014466088068528068,
      "loss": 8.1234,
      "step": 77
    },
    {
      "epoch": 0.006503387180823345,
      "grad_norm": 3.9442667961120605,
      "learning_rate": 0.00014317543523384928,
      "loss": 6.803,
      "step": 78
    },
    {
      "epoch": 0.00658676393955185,
      "grad_norm": 4.572739601135254,
      "learning_rate": 0.00014167818604952906,
      "loss": 9.0664,
      "step": 79
    },
    {
      "epoch": 0.006670140698280354,
      "grad_norm": 5.51597785949707,
      "learning_rate": 0.00014016954246529696,
      "loss": 8.7253,
      "step": 80
    },
    {
      "epoch": 0.006753517457008858,
      "grad_norm": 6.085229396820068,
      "learning_rate": 0.00013864991692924523,
      "loss": 7.5132,
      "step": 81
    },
    {
      "epoch": 0.006836894215737363,
      "grad_norm": 5.175082683563232,
      "learning_rate": 0.00013711972489182208,
      "loss": 6.8016,
      "step": 82
    },
    {
      "epoch": 0.0069202709744658674,
      "grad_norm": 3.689929723739624,
      "learning_rate": 0.00013557938469225167,
      "loss": 8.3562,
      "step": 83
    },
    {
      "epoch": 0.007003647733194372,
      "grad_norm": 3.6427206993103027,
      "learning_rate": 0.00013402931744416433,
      "loss": 7.3571,
      "step": 84
    },
    {
      "epoch": 0.0070870244919228765,
      "grad_norm": 4.614701271057129,
      "learning_rate": 0.00013246994692046836,
      "loss": 8.9938,
      "step": 85
    },
    {
      "epoch": 0.007170401250651381,
      "grad_norm": 4.688872337341309,
      "learning_rate": 0.00013090169943749476,
      "loss": 7.112,
      "step": 86
    },
    {
      "epoch": 0.0072537780093798855,
      "grad_norm": 3.395167350769043,
      "learning_rate": 0.0001293250037384465,
      "loss": 9.9337,
      "step": 87
    },
    {
      "epoch": 0.00733715476810839,
      "grad_norm": 4.059149265289307,
      "learning_rate": 0.00012774029087618446,
      "loss": 8.5826,
      "step": 88
    },
    {
      "epoch": 0.007420531526836895,
      "grad_norm": 3.590693473815918,
      "learning_rate": 0.00012614799409538198,
      "loss": 6.5018,
      "step": 89
    },
    {
      "epoch": 0.007503908285565399,
      "grad_norm": 4.000766277313232,
      "learning_rate": 0.00012454854871407994,
      "loss": 7.8792,
      "step": 90
    },
    {
      "epoch": 0.007587285044293903,
      "grad_norm": 5.370626449584961,
      "learning_rate": 0.00012294239200467516,
      "loss": 9.4348,
      "step": 91
    },
    {
      "epoch": 0.007670661803022408,
      "grad_norm": 4.013746738433838,
      "learning_rate": 0.0001213299630743747,
      "loss": 7.533,
      "step": 92
    },
    {
      "epoch": 0.007754038561750912,
      "grad_norm": 5.8706583976745605,
      "learning_rate": 0.00011971170274514802,
      "loss": 8.9689,
      "step": 93
    },
    {
      "epoch": 0.007837415320479416,
      "grad_norm": 5.318309783935547,
      "learning_rate": 0.000118088053433211,
      "loss": 11.444,
      "step": 94
    },
    {
      "epoch": 0.007920792079207921,
      "grad_norm": 5.957444667816162,
      "learning_rate": 0.00011645945902807341,
      "loss": 9.4652,
      "step": 95
    },
    {
      "epoch": 0.008004168837936426,
      "grad_norm": 4.867214679718018,
      "learning_rate": 0.0001148263647711842,
      "loss": 8.7648,
      "step": 96
    },
    {
      "epoch": 0.008087545596664929,
      "grad_norm": 4.044260501861572,
      "learning_rate": 0.00011318921713420691,
      "loss": 9.4336,
      "step": 97
    },
    {
      "epoch": 0.008170922355393434,
      "grad_norm": 5.8006205558776855,
      "learning_rate": 0.00011154846369695863,
      "loss": 8.8013,
      "step": 98
    },
    {
      "epoch": 0.008254299114121939,
      "grad_norm": 3.970647096633911,
      "learning_rate": 0.0001099045530250463,
      "loss": 8.509,
      "step": 99
    },
    {
      "epoch": 0.008337675872850442,
      "grad_norm": 2.7641403675079346,
      "learning_rate": 0.00010825793454723325,
      "loss": 5.7705,
      "step": 100
    },
    {
      "epoch": 0.008337675872850442,
      "eval_loss": 1.9884008169174194,
      "eval_runtime": 70.3896,
      "eval_samples_per_second": 71.744,
      "eval_steps_per_second": 35.872,
      "step": 100
    },
    {
      "epoch": 0.008421052631578947,
      "grad_norm": 4.176263332366943,
      "learning_rate": 0.00010660905843256994,
      "loss": 7.0963,
      "step": 101
    },
    {
      "epoch": 0.008504429390307452,
      "grad_norm": 3.327336549758911,
      "learning_rate": 0.00010495837546732224,
      "loss": 7.0791,
      "step": 102
    },
    {
      "epoch": 0.008587806149035955,
      "grad_norm": 5.326552391052246,
      "learning_rate": 0.00010330633693173082,
      "loss": 8.5557,
      "step": 103
    },
    {
      "epoch": 0.00867118290776446,
      "grad_norm": 3.137587785720825,
      "learning_rate": 0.00010165339447663587,
      "loss": 7.6346,
      "step": 104
    },
    {
      "epoch": 0.008754559666492965,
      "grad_norm": 4.366918087005615,
      "learning_rate": 0.0001,
      "loss": 7.8156,
      "step": 105
    },
    {
      "epoch": 0.00883793642522147,
      "grad_norm": 4.763668060302734,
      "learning_rate": 9.834660552336415e-05,
      "loss": 8.0528,
      "step": 106
    },
    {
      "epoch": 0.008921313183949973,
      "grad_norm": 3.8986947536468506,
      "learning_rate": 9.669366306826919e-05,
      "loss": 6.3366,
      "step": 107
    },
    {
      "epoch": 0.009004689942678478,
      "grad_norm": 5.50061559677124,
      "learning_rate": 9.504162453267777e-05,
      "loss": 7.7455,
      "step": 108
    },
    {
      "epoch": 0.009088066701406983,
      "grad_norm": 3.723641872406006,
      "learning_rate": 9.339094156743007e-05,
      "loss": 6.2653,
      "step": 109
    },
    {
      "epoch": 0.009171443460135487,
      "grad_norm": 3.923414945602417,
      "learning_rate": 9.174206545276677e-05,
      "loss": 7.2898,
      "step": 110
    },
    {
      "epoch": 0.009254820218863992,
      "grad_norm": 4.864738464355469,
      "learning_rate": 9.009544697495374e-05,
      "loss": 9.1495,
      "step": 111
    },
    {
      "epoch": 0.009338196977592497,
      "grad_norm": 4.508790016174316,
      "learning_rate": 8.845153630304139e-05,
      "loss": 7.5381,
      "step": 112
    },
    {
      "epoch": 0.009421573736321,
      "grad_norm": 5.183701515197754,
      "learning_rate": 8.681078286579311e-05,
      "loss": 8.099,
      "step": 113
    },
    {
      "epoch": 0.009504950495049505,
      "grad_norm": 5.300266742706299,
      "learning_rate": 8.517363522881579e-05,
      "loss": 9.3539,
      "step": 114
    },
    {
      "epoch": 0.00958832725377801,
      "grad_norm": 4.453273773193359,
      "learning_rate": 8.35405409719266e-05,
      "loss": 9.0837,
      "step": 115
    },
    {
      "epoch": 0.009671704012506515,
      "grad_norm": 5.1587910652160645,
      "learning_rate": 8.191194656678904e-05,
      "loss": 10.6395,
      "step": 116
    },
    {
      "epoch": 0.009755080771235018,
      "grad_norm": 4.783015727996826,
      "learning_rate": 8.028829725485199e-05,
      "loss": 7.0859,
      "step": 117
    },
    {
      "epoch": 0.009838457529963523,
      "grad_norm": 5.205554485321045,
      "learning_rate": 7.867003692562534e-05,
      "loss": 6.7606,
      "step": 118
    },
    {
      "epoch": 0.009921834288692028,
      "grad_norm": 3.5442142486572266,
      "learning_rate": 7.705760799532485e-05,
      "loss": 5.8093,
      "step": 119
    },
    {
      "epoch": 0.010005211047420531,
      "grad_norm": 4.767430305480957,
      "learning_rate": 7.54514512859201e-05,
      "loss": 8.8166,
      "step": 120
    },
    {
      "epoch": 0.010088587806149036,
      "grad_norm": 4.638835430145264,
      "learning_rate": 7.385200590461803e-05,
      "loss": 7.7852,
      "step": 121
    },
    {
      "epoch": 0.010171964564877541,
      "grad_norm": 3.8365132808685303,
      "learning_rate": 7.225970912381556e-05,
      "loss": 7.4408,
      "step": 122
    },
    {
      "epoch": 0.010255341323606044,
      "grad_norm": 3.970036506652832,
      "learning_rate": 7.067499626155354e-05,
      "loss": 8.0905,
      "step": 123
    },
    {
      "epoch": 0.010338718082334549,
      "grad_norm": 3.594309091567993,
      "learning_rate": 6.909830056250527e-05,
      "loss": 9.6643,
      "step": 124
    },
    {
      "epoch": 0.010422094841063054,
      "grad_norm": 4.544534206390381,
      "learning_rate": 6.753005307953167e-05,
      "loss": 7.2089,
      "step": 125
    },
    {
      "epoch": 0.010505471599791557,
      "grad_norm": 4.061487674713135,
      "learning_rate": 6.59706825558357e-05,
      "loss": 6.8838,
      "step": 126
    },
    {
      "epoch": 0.010588848358520062,
      "grad_norm": 4.6268510818481445,
      "learning_rate": 6.442061530774834e-05,
      "loss": 8.0996,
      "step": 127
    },
    {
      "epoch": 0.010672225117248567,
      "grad_norm": 4.591619491577148,
      "learning_rate": 6.28802751081779e-05,
      "loss": 9.0279,
      "step": 128
    },
    {
      "epoch": 0.010755601875977072,
      "grad_norm": 3.5984318256378174,
      "learning_rate": 6.135008307075481e-05,
      "loss": 6.8998,
      "step": 129
    },
    {
      "epoch": 0.010838978634705575,
      "grad_norm": 4.134429931640625,
      "learning_rate": 5.983045753470308e-05,
      "loss": 7.7342,
      "step": 130
    },
    {
      "epoch": 0.01092235539343408,
      "grad_norm": 4.544061660766602,
      "learning_rate": 5.832181395047098e-05,
      "loss": 6.6566,
      "step": 131
    },
    {
      "epoch": 0.011005732152162585,
      "grad_norm": 4.653716087341309,
      "learning_rate": 5.6824564766150726e-05,
      "loss": 8.1876,
      "step": 132
    },
    {
      "epoch": 0.011089108910891089,
      "grad_norm": 4.810211181640625,
      "learning_rate": 5.533911931471936e-05,
      "loss": 9.1589,
      "step": 133
    },
    {
      "epoch": 0.011172485669619594,
      "grad_norm": 4.453709125518799,
      "learning_rate": 5.386588370213124e-05,
      "loss": 8.7409,
      "step": 134
    },
    {
      "epoch": 0.011255862428348098,
      "grad_norm": 5.478562831878662,
      "learning_rate": 5.240526069629265e-05,
      "loss": 10.2675,
      "step": 135
    },
    {
      "epoch": 0.011339239187076602,
      "grad_norm": 4.585888862609863,
      "learning_rate": 5.095764961694922e-05,
      "loss": 7.5847,
      "step": 136
    },
    {
      "epoch": 0.011422615945805107,
      "grad_norm": 4.79464054107666,
      "learning_rate": 4.952344622651566e-05,
      "loss": 8.9001,
      "step": 137
    },
    {
      "epoch": 0.011505992704533612,
      "grad_norm": 5.097901344299316,
      "learning_rate": 4.810304262187852e-05,
      "loss": 9.4132,
      "step": 138
    },
    {
      "epoch": 0.011589369463262115,
      "grad_norm": 3.9933440685272217,
      "learning_rate": 4.669682712720065e-05,
      "loss": 7.8629,
      "step": 139
    },
    {
      "epoch": 0.01167274622199062,
      "grad_norm": 4.931108474731445,
      "learning_rate": 4.530518418775733e-05,
      "loss": 8.0418,
      "step": 140
    },
    {
      "epoch": 0.011756122980719125,
      "grad_norm": 4.090979099273682,
      "learning_rate": 4.392849426483274e-05,
      "loss": 7.5149,
      "step": 141
    },
    {
      "epoch": 0.01183949973944763,
      "grad_norm": 4.233511447906494,
      "learning_rate": 4.256713373170564e-05,
      "loss": 6.9281,
      "step": 142
    },
    {
      "epoch": 0.011922876498176133,
      "grad_norm": 3.975322723388672,
      "learning_rate": 4.12214747707527e-05,
      "loss": 7.2225,
      "step": 143
    },
    {
      "epoch": 0.012006253256904638,
      "grad_norm": 5.84738302230835,
      "learning_rate": 3.9891885271697496e-05,
      "loss": 9.5679,
      "step": 144
    },
    {
      "epoch": 0.012089630015633143,
      "grad_norm": 3.988117218017578,
      "learning_rate": 3.857872873103322e-05,
      "loss": 6.1582,
      "step": 145
    },
    {
      "epoch": 0.012173006774361646,
      "grad_norm": 3.905040740966797,
      "learning_rate": 3.7282364152646297e-05,
      "loss": 7.2591,
      "step": 146
    },
    {
      "epoch": 0.012256383533090151,
      "grad_norm": 5.4452924728393555,
      "learning_rate": 3.600314594966834e-05,
      "loss": 9.7908,
      "step": 147
    },
    {
      "epoch": 0.012339760291818656,
      "grad_norm": 5.016132354736328,
      "learning_rate": 3.4741423847583134e-05,
      "loss": 7.3294,
      "step": 148
    },
    {
      "epoch": 0.01242313705054716,
      "grad_norm": 4.367519855499268,
      "learning_rate": 3.349754278861517e-05,
      "loss": 9.6115,
      "step": 149
    },
    {
      "epoch": 0.012506513809275664,
      "grad_norm": 4.329085350036621,
      "learning_rate": 3.227184283742591e-05,
      "loss": 7.6718,
      "step": 150
    },
    {
      "epoch": 0.012506513809275664,
      "eval_loss": 1.9661880731582642,
      "eval_runtime": 70.4704,
      "eval_samples_per_second": 71.661,
      "eval_steps_per_second": 35.831,
      "step": 150
    },
    {
      "epoch": 0.01258989056800417,
      "grad_norm": 3.8532228469848633,
      "learning_rate": 3.106465908814342e-05,
      "loss": 9.0608,
      "step": 151
    },
    {
      "epoch": 0.012673267326732674,
      "grad_norm": 4.6140456199646,
      "learning_rate": 2.9876321572751144e-05,
      "loss": 7.3028,
      "step": 152
    },
    {
      "epoch": 0.012756644085461177,
      "grad_norm": 4.772500514984131,
      "learning_rate": 2.87071551708603e-05,
      "loss": 10.0175,
      "step": 153
    },
    {
      "epoch": 0.012840020844189682,
      "grad_norm": 4.909071445465088,
      "learning_rate": 2.7557479520891104e-05,
      "loss": 9.2292,
      "step": 154
    },
    {
      "epoch": 0.012923397602918187,
      "grad_norm": 4.347821235656738,
      "learning_rate": 2.6427608932686843e-05,
      "loss": 7.6785,
      "step": 155
    },
    {
      "epoch": 0.01300677436164669,
      "grad_norm": 5.958515644073486,
      "learning_rate": 2.5317852301584643e-05,
      "loss": 10.8612,
      "step": 156
    },
    {
      "epoch": 0.013090151120375195,
      "grad_norm": 4.036256790161133,
      "learning_rate": 2.422851302396655e-05,
      "loss": 6.5377,
      "step": 157
    },
    {
      "epoch": 0.0131735278791037,
      "grad_norm": 4.0767436027526855,
      "learning_rate": 2.315988891431412e-05,
      "loss": 6.9137,
      "step": 158
    },
    {
      "epoch": 0.013256904637832204,
      "grad_norm": 4.559920787811279,
      "learning_rate": 2.2112272123788768e-05,
      "loss": 7.8971,
      "step": 159
    },
    {
      "epoch": 0.013340281396560709,
      "grad_norm": 3.1472551822662354,
      "learning_rate": 2.1085949060360654e-05,
      "loss": 5.2795,
      "step": 160
    },
    {
      "epoch": 0.013423658155289214,
      "grad_norm": 4.643826961517334,
      "learning_rate": 2.008120031050753e-05,
      "loss": 10.4631,
      "step": 161
    },
    {
      "epoch": 0.013507034914017717,
      "grad_norm": 5.551768779754639,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 8.5777,
      "step": 162
    },
    {
      "epoch": 0.013590411672746222,
      "grad_norm": 3.418463706970215,
      "learning_rate": 1.8137518531330767e-05,
      "loss": 5.9437,
      "step": 163
    },
    {
      "epoch": 0.013673788431474727,
      "grad_norm": 4.048105239868164,
      "learning_rate": 1.7199116885197995e-05,
      "loss": 7.9703,
      "step": 164
    },
    {
      "epoch": 0.013757165190203232,
      "grad_norm": 3.720338821411133,
      "learning_rate": 1.6283352173747145e-05,
      "loss": 7.5605,
      "step": 165
    },
    {
      "epoch": 0.013840541948931735,
      "grad_norm": 4.476045608520508,
      "learning_rate": 1.5390474757906446e-05,
      "loss": 7.0048,
      "step": 166
    },
    {
      "epoch": 0.01392391870766024,
      "grad_norm": 4.96762228012085,
      "learning_rate": 1.4520728741446089e-05,
      "loss": 10.7742,
      "step": 167
    },
    {
      "epoch": 0.014007295466388745,
      "grad_norm": 4.895275592803955,
      "learning_rate": 1.3674351904242611e-05,
      "loss": 6.773,
      "step": 168
    },
    {
      "epoch": 0.014090672225117248,
      "grad_norm": 5.70258903503418,
      "learning_rate": 1.2851575637272262e-05,
      "loss": 11.6797,
      "step": 169
    },
    {
      "epoch": 0.014174048983845753,
      "grad_norm": 3.3568615913391113,
      "learning_rate": 1.2052624879351104e-05,
      "loss": 6.4714,
      "step": 170
    },
    {
      "epoch": 0.014257425742574258,
      "grad_norm": 4.252028465270996,
      "learning_rate": 1.1277718055638819e-05,
      "loss": 7.7119,
      "step": 171
    },
    {
      "epoch": 0.014340802501302761,
      "grad_norm": 4.521555423736572,
      "learning_rate": 1.0527067017923654e-05,
      "loss": 7.6881,
      "step": 172
    },
    {
      "epoch": 0.014424179260031266,
      "grad_norm": 4.935907363891602,
      "learning_rate": 9.80087698670411e-06,
      "loss": 8.6496,
      "step": 173
    },
    {
      "epoch": 0.014507556018759771,
      "grad_norm": 3.695267915725708,
      "learning_rate": 9.09934649508375e-06,
      "loss": 7.9681,
      "step": 174
    },
    {
      "epoch": 0.014590932777488274,
      "grad_norm": 4.178027629852295,
      "learning_rate": 8.422667334494249e-06,
      "loss": 8.4342,
      "step": 175
    },
    {
      "epoch": 0.01467430953621678,
      "grad_norm": 4.292721748352051,
      "learning_rate": 7.771024502261526e-06,
      "loss": 8.0612,
      "step": 176
    },
    {
      "epoch": 0.014757686294945284,
      "grad_norm": 5.052070140838623,
      "learning_rate": 7.144596151029303e-06,
      "loss": 7.641,
      "step": 177
    },
    {
      "epoch": 0.01484106305367379,
      "grad_norm": 5.070157527923584,
      "learning_rate": 6.543553540053926e-06,
      "loss": 7.9055,
      "step": 178
    },
    {
      "epoch": 0.014924439812402292,
      "grad_norm": 4.510176658630371,
      "learning_rate": 5.968060988383883e-06,
      "loss": 6.882,
      "step": 179
    },
    {
      "epoch": 0.015007816571130797,
      "grad_norm": 4.591582775115967,
      "learning_rate": 5.418275829936537e-06,
      "loss": 7.9564,
      "step": 180
    },
    {
      "epoch": 0.015091193329859302,
      "grad_norm": 5.847330570220947,
      "learning_rate": 4.8943483704846475e-06,
      "loss": 9.3527,
      "step": 181
    },
    {
      "epoch": 0.015174570088587806,
      "grad_norm": 4.053606033325195,
      "learning_rate": 4.3964218465642355e-06,
      "loss": 8.5674,
      "step": 182
    },
    {
      "epoch": 0.01525794684731631,
      "grad_norm": 4.323848247528076,
      "learning_rate": 3.924632386315186e-06,
      "loss": 8.7715,
      "step": 183
    },
    {
      "epoch": 0.015341323606044815,
      "grad_norm": 4.508976459503174,
      "learning_rate": 3.4791089722651436e-06,
      "loss": 9.1552,
      "step": 184
    },
    {
      "epoch": 0.015424700364773319,
      "grad_norm": 3.246110439300537,
      "learning_rate": 3.059973406066963e-06,
      "loss": 8.1741,
      "step": 185
    },
    {
      "epoch": 0.015508077123501824,
      "grad_norm": 4.064354419708252,
      "learning_rate": 2.667340275199426e-06,
      "loss": 9.1723,
      "step": 186
    },
    {
      "epoch": 0.015591453882230329,
      "grad_norm": 4.16738748550415,
      "learning_rate": 2.3013169216400733e-06,
      "loss": 7.7564,
      "step": 187
    },
    {
      "epoch": 0.015674830640958832,
      "grad_norm": 3.58866810798645,
      "learning_rate": 1.9620034125190644e-06,
      "loss": 8.4719,
      "step": 188
    },
    {
      "epoch": 0.01575820739968734,
      "grad_norm": 4.435379505157471,
      "learning_rate": 1.6494925127617634e-06,
      "loss": 8.5871,
      "step": 189
    },
    {
      "epoch": 0.015841584158415842,
      "grad_norm": 4.594212532043457,
      "learning_rate": 1.3638696597277679e-06,
      "loss": 8.6829,
      "step": 190
    },
    {
      "epoch": 0.015924960917144345,
      "grad_norm": 5.113223552703857,
      "learning_rate": 1.1052129398531507e-06,
      "loss": 7.9665,
      "step": 191
    },
    {
      "epoch": 0.01600833767587285,
      "grad_norm": 3.7875256538391113,
      "learning_rate": 8.735930673024806e-07,
      "loss": 6.4576,
      "step": 192
    },
    {
      "epoch": 0.016091714434601355,
      "grad_norm": 3.661071300506592,
      "learning_rate": 6.690733646361857e-07,
      "loss": 6.8199,
      "step": 193
    },
    {
      "epoch": 0.016175091193329858,
      "grad_norm": 4.465787887573242,
      "learning_rate": 4.917097454988584e-07,
      "loss": 8.3244,
      "step": 194
    },
    {
      "epoch": 0.016258467952058365,
      "grad_norm": 4.833078861236572,
      "learning_rate": 3.415506993330153e-07,
      "loss": 8.3167,
      "step": 195
    },
    {
      "epoch": 0.016341844710786868,
      "grad_norm": 3.7651824951171875,
      "learning_rate": 2.1863727812254653e-07,
      "loss": 6.5716,
      "step": 196
    },
    {
      "epoch": 0.01642522146951537,
      "grad_norm": 4.892351150512695,
      "learning_rate": 1.230030851695263e-07,
      "loss": 7.0042,
      "step": 197
    },
    {
      "epoch": 0.016508598228243878,
      "grad_norm": 3.952244758605957,
      "learning_rate": 5.467426590739511e-08,
      "loss": 7.8461,
      "step": 198
    },
    {
      "epoch": 0.01659197498697238,
      "grad_norm": 3.891923427581787,
      "learning_rate": 1.3669500753099585e-08,
      "loss": 7.6043,
      "step": 199
    },
    {
      "epoch": 0.016675351745700884,
      "grad_norm": 5.590742588043213,
      "learning_rate": 0.0,
      "loss": 10.1178,
      "step": 200
    },
    {
      "epoch": 0.016675351745700884,
      "eval_loss": 1.9614673852920532,
      "eval_runtime": 69.1641,
      "eval_samples_per_second": 73.015,
      "eval_steps_per_second": 36.507,
      "step": 200
    }
  ],
  "logging_steps": 1,
  "max_steps": 200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1821610856153088.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}