{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.6817364986285822,
  "eval_steps": 133,
  "global_step": 1802,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0003783221413033198,
      "grad_norm": 56.01426433664828,
      "learning_rate": 1e-08,
      "loss": 8.5655,
      "step": 1
    },
    {
      "epoch": 0.0003783221413033198,
      "eval_loss": 8.416223526000977,
      "eval_runtime": 26.8642,
      "eval_samples_per_second": 32.944,
      "eval_steps_per_second": 1.042,
      "step": 1
    },
    {
      "epoch": 0.0003783221413033198,
      "eval_bench_accuracy_arc_challenge": 0.12857142857142856,
      "eval_bench_accuracy_hellaswag": 0.025,
      "eval_bench_accuracy_mmlu": 0.21739130434782608,
      "eval_bench_average_accuracy": 0.1236542443064182,
      "eval_bench_loss": 10.19635223924068,
      "eval_bench_total_accuracy": 0.1054945054945055,
      "step": 1
    },
    {
      "epoch": 0.0007566442826066396,
      "grad_norm": 52.75063804651517,
      "learning_rate": 2e-08,
      "loss": 8.4236,
      "step": 2
    },
    {
      "epoch": 0.0011349664239099593,
      "grad_norm": 54.29511008856074,
      "learning_rate": 3e-08,
      "loss": 8.5128,
      "step": 3
    },
    {
      "epoch": 0.0015132885652132792,
      "grad_norm": 50.84717091006242,
      "learning_rate": 4e-08,
      "loss": 8.368,
      "step": 4
    },
    {
      "epoch": 0.0018916107065165989,
      "grad_norm": 58.682276590467374,
      "learning_rate": 5e-08,
      "loss": 8.5171,
      "step": 5
    },
    {
      "epoch": 0.0022699328478199185,
      "grad_norm": 54.19973526319146,
      "learning_rate": 6e-08,
      "loss": 8.4329,
      "step": 6
    },
    {
      "epoch": 0.0026482549891232382,
      "grad_norm": 52.00177926668044,
      "learning_rate": 7e-08,
      "loss": 8.4562,
      "step": 7
    },
    {
      "epoch": 0.0030265771304265584,
      "grad_norm": 55.9652762703784,
      "learning_rate": 8e-08,
      "loss": 8.5017,
      "step": 8
    },
    {
      "epoch": 0.003404899271729878,
      "grad_norm": 54.88105368356734,
      "learning_rate": 9e-08,
      "loss": 8.471,
      "step": 9
    },
    {
      "epoch": 0.0037832214130331977,
      "grad_norm": 50.22661382824928,
      "learning_rate": 1e-07,
      "loss": 8.4042,
      "step": 10
    },
    {
      "epoch": 0.004161543554336518,
      "grad_norm": 51.712774406266966,
      "learning_rate": 1.0999999999999999e-07,
      "loss": 8.4819,
      "step": 11
    },
    {
      "epoch": 0.004539865695639837,
      "grad_norm": 44.20700801792938,
      "learning_rate": 1.2e-07,
      "loss": 8.2981,
      "step": 12
    },
    {
      "epoch": 0.004918187836943157,
      "grad_norm": 46.914384802444836,
      "learning_rate": 1.3e-07,
      "loss": 8.4152,
      "step": 13
    },
    {
      "epoch": 0.0052965099782464765,
      "grad_norm": 46.66045652280597,
      "learning_rate": 1.4e-07,
      "loss": 8.4776,
      "step": 14
    },
    {
      "epoch": 0.005674832119549797,
      "grad_norm": 45.99567071730722,
      "learning_rate": 1.5e-07,
      "loss": 8.4602,
      "step": 15
    },
    {
      "epoch": 0.006053154260853117,
      "grad_norm": 31.7220420827569,
      "learning_rate": 1.6e-07,
      "loss": 8.342,
      "step": 16
    },
    {
      "epoch": 0.006431476402156436,
      "grad_norm": 31.79821930177939,
      "learning_rate": 1.7000000000000001e-07,
      "loss": 8.4073,
      "step": 17
    },
    {
      "epoch": 0.006809798543459756,
      "grad_norm": 34.99852513062481,
      "learning_rate": 1.8e-07,
      "loss": 8.4475,
      "step": 18
    },
    {
      "epoch": 0.007188120684763075,
      "grad_norm": 32.34312521349501,
      "learning_rate": 1.8999999999999998e-07,
      "loss": 8.3691,
      "step": 19
    },
    {
      "epoch": 0.0075664428260663955,
      "grad_norm": 28.491575199383966,
      "learning_rate": 2e-07,
      "loss": 8.2467,
      "step": 20
    },
    {
      "epoch": 0.007944764967369716,
      "grad_norm": 27.788350456113577,
      "learning_rate": 2.0999999999999997e-07,
      "loss": 8.2619,
      "step": 21
    },
    {
      "epoch": 0.008323087108673036,
      "grad_norm": 23.054768686734494,
      "learning_rate": 2.1999999999999998e-07,
      "loss": 8.2719,
      "step": 22
    },
    {
      "epoch": 0.008701409249976354,
      "grad_norm": 20.862948070445295,
      "learning_rate": 2.3e-07,
      "loss": 8.1701,
      "step": 23
    },
    {
      "epoch": 0.009079731391279674,
      "grad_norm": 23.840305973367958,
      "learning_rate": 2.4e-07,
      "loss": 8.2447,
      "step": 24
    },
    {
      "epoch": 0.009458053532582994,
      "grad_norm": 22.407061285607927,
      "learning_rate": 2.5e-07,
      "loss": 8.2056,
      "step": 25
    },
    {
      "epoch": 0.009836375673886314,
      "grad_norm": 21.55132867797403,
      "learning_rate": 2.6e-07,
      "loss": 8.1552,
      "step": 26
    },
    {
      "epoch": 0.010214697815189635,
      "grad_norm": 20.992840710071967,
      "learning_rate": 2.7e-07,
      "loss": 8.188,
      "step": 27
    },
    {
      "epoch": 0.010593019956492953,
      "grad_norm": 22.39828627182125,
      "learning_rate": 2.8e-07,
      "loss": 8.1256,
      "step": 28
    },
    {
      "epoch": 0.010971342097796273,
      "grad_norm": 18.46346034557574,
      "learning_rate": 2.9e-07,
      "loss": 8.0045,
      "step": 29
    },
    {
      "epoch": 0.011349664239099593,
      "grad_norm": 12.704677816631309,
      "learning_rate": 3e-07,
      "loss": 8.0417,
      "step": 30
    },
    {
      "epoch": 0.011727986380402913,
      "grad_norm": 15.722346563574124,
      "learning_rate": 3.1e-07,
      "loss": 7.9647,
      "step": 31
    },
    {
      "epoch": 0.012106308521706233,
      "grad_norm": 14.31712037195988,
      "learning_rate": 3.2e-07,
      "loss": 8.0119,
      "step": 32
    },
    {
      "epoch": 0.012484630663009552,
      "grad_norm": 13.002942588027526,
      "learning_rate": 3.3e-07,
      "loss": 8.029,
      "step": 33
    },
    {
      "epoch": 0.012862952804312872,
      "grad_norm": 15.303670533896709,
      "learning_rate": 3.4000000000000003e-07,
      "loss": 7.9847,
      "step": 34
    },
    {
      "epoch": 0.013241274945616192,
      "grad_norm": 12.964425414274471,
      "learning_rate": 3.5e-07,
      "loss": 8.0026,
      "step": 35
    },
    {
      "epoch": 0.013619597086919512,
      "grad_norm": 19.040688578500415,
      "learning_rate": 3.6e-07,
      "loss": 8.0397,
      "step": 36
    },
    {
      "epoch": 0.013997919228222832,
      "grad_norm": 14.264527574014561,
      "learning_rate": 3.7e-07,
      "loss": 7.8472,
      "step": 37
    },
    {
      "epoch": 0.01437624136952615,
      "grad_norm": 14.259878980724565,
      "learning_rate": 3.7999999999999996e-07,
      "loss": 7.9499,
      "step": 38
    },
    {
      "epoch": 0.01475456351082947,
      "grad_norm": 21.02927607859569,
      "learning_rate": 3.8999999999999997e-07,
      "loss": 7.8521,
      "step": 39
    },
    {
      "epoch": 0.015132885652132791,
      "grad_norm": 16.308228829260607,
      "learning_rate": 4e-07,
      "loss": 7.8008,
      "step": 40
    },
    {
      "epoch": 0.015511207793436111,
      "grad_norm": 21.835730681754328,
      "learning_rate": 4.0999999999999994e-07,
      "loss": 7.7515,
      "step": 41
    },
    {
      "epoch": 0.01588952993473943,
      "grad_norm": 22.548471887636545,
      "learning_rate": 4.1999999999999995e-07,
      "loss": 7.7859,
      "step": 42
    },
    {
      "epoch": 0.01626785207604275,
      "grad_norm": 23.40758724577002,
      "learning_rate": 4.2999999999999996e-07,
      "loss": 7.7679,
      "step": 43
    },
    {
      "epoch": 0.01664617421734607,
      "grad_norm": 22.806229545212982,
      "learning_rate": 4.3999999999999997e-07,
      "loss": 7.7211,
      "step": 44
    },
    {
      "epoch": 0.01702449635864939,
      "grad_norm": 19.930882370057223,
      "learning_rate": 4.5e-07,
      "loss": 7.7017,
      "step": 45
    },
    {
      "epoch": 0.017402818499952708,
      "grad_norm": 17.292062567746196,
      "learning_rate": 4.6e-07,
      "loss": 7.7146,
      "step": 46
    },
    {
      "epoch": 0.01778114064125603,
      "grad_norm": 18.070618266890932,
      "learning_rate": 4.6999999999999995e-07,
      "loss": 7.7119,
      "step": 47
    },
    {
      "epoch": 0.01815946278255935,
      "grad_norm": 16.65539275683302,
      "learning_rate": 4.8e-07,
      "loss": 7.6178,
      "step": 48
    },
    {
      "epoch": 0.01853778492386267,
      "grad_norm": 19.36073786979339,
      "learning_rate": 4.9e-07,
      "loss": 7.6387,
      "step": 49
    },
    {
      "epoch": 0.01891610706516599,
      "grad_norm": 22.520853767642276,
      "learning_rate": 5e-07,
      "loss": 7.6346,
      "step": 50
    },
    {
      "epoch": 0.01929442920646931,
      "grad_norm": 21.674704957397896,
      "learning_rate": 5.1e-07,
      "loss": 7.5339,
      "step": 51
    },
    {
      "epoch": 0.01967275134777263,
      "grad_norm": 26.85039717209422,
      "learning_rate": 5.2e-07,
      "loss": 7.3655,
      "step": 52
    },
    {
      "epoch": 0.02005107348907595,
      "grad_norm": 29.784500661137994,
      "learning_rate": 5.3e-07,
      "loss": 7.3935,
      "step": 53
    },
    {
      "epoch": 0.02042939563037927,
      "grad_norm": 36.73803214173563,
      "learning_rate": 5.4e-07,
      "loss": 7.3942,
      "step": 54
    },
    {
      "epoch": 0.02080771777168259,
      "grad_norm": 55.998259201380826,
      "learning_rate": 5.5e-07,
      "loss": 7.3246,
      "step": 55
    },
    {
      "epoch": 0.021186039912985906,
      "grad_norm": 54.6219968094922,
      "learning_rate": 5.6e-07,
      "loss": 7.2241,
      "step": 56
    },
    {
      "epoch": 0.021564362054289226,
      "grad_norm": 115.48000957700997,
      "learning_rate": 5.699999999999999e-07,
      "loss": 7.3169,
      "step": 57
    },
    {
      "epoch": 0.021942684195592546,
      "grad_norm": 240.40441808566737,
      "learning_rate": 5.8e-07,
      "loss": 7.1243,
      "step": 58
    },
    {
      "epoch": 0.022321006336895866,
      "grad_norm": 102.2272021984647,
      "learning_rate": 5.9e-07,
      "loss": 7.0371,
      "step": 59
    },
    {
      "epoch": 0.022699328478199186,
      "grad_norm": 256.9288700751086,
      "learning_rate": 6e-07,
      "loss": 6.8907,
      "step": 60
    },
    {
      "epoch": 0.023077650619502506,
      "grad_norm": 131.56800170402965,
      "learning_rate": 6.1e-07,
      "loss": 6.854,
      "step": 61
    },
    {
      "epoch": 0.023455972760805827,
      "grad_norm": 358.2045690657579,
      "learning_rate": 6.2e-07,
      "loss": 6.7673,
      "step": 62
    },
    {
      "epoch": 0.023834294902109147,
      "grad_norm": 259.0360488341225,
      "learning_rate": 6.3e-07,
      "loss": 6.6898,
      "step": 63
    },
    {
      "epoch": 0.024212617043412467,
      "grad_norm": 324.46556421575104,
      "learning_rate": 6.4e-07,
      "loss": 6.6792,
      "step": 64
    },
    {
      "epoch": 0.024590939184715787,
      "grad_norm": 218.90309813691587,
      "learning_rate": 6.5e-07,
      "loss": 6.5833,
      "step": 65
    },
    {
      "epoch": 0.024969261326019104,
      "grad_norm": 345.9947605906595,
      "learning_rate": 6.6e-07,
      "loss": 6.5841,
      "step": 66
    },
    {
      "epoch": 0.025347583467322424,
      "grad_norm": 327.5192852015763,
      "learning_rate": 6.7e-07,
      "loss": 6.5379,
      "step": 67
    },
    {
      "epoch": 0.025725905608625744,
      "grad_norm": 272.0304082708135,
      "learning_rate": 6.800000000000001e-07,
      "loss": 6.4003,
      "step": 68
    },
    {
      "epoch": 0.026104227749929064,
      "grad_norm": 224.03062395364572,
      "learning_rate": 6.9e-07,
      "loss": 6.3064,
      "step": 69
    },
    {
      "epoch": 0.026482549891232384,
      "grad_norm": 326.13516923115037,
      "learning_rate": 7e-07,
      "loss": 6.2681,
      "step": 70
    },
    {
      "epoch": 0.026860872032535704,
      "grad_norm": 236.06386821993763,
      "learning_rate": 7.1e-07,
      "loss": 6.1658,
      "step": 71
    },
    {
      "epoch": 0.027239194173839024,
      "grad_norm": 117.09820504079929,
      "learning_rate": 7.2e-07,
      "loss": 6.1013,
      "step": 72
    },
    {
      "epoch": 0.027617516315142344,
      "grad_norm": 130.77996709008073,
      "learning_rate": 7.3e-07,
      "loss": 6.0313,
      "step": 73
    },
    {
      "epoch": 0.027995838456445665,
      "grad_norm": 184.1694406122909,
      "learning_rate": 7.4e-07,
      "loss": 5.9761,
      "step": 74
    },
    {
      "epoch": 0.028374160597748985,
      "grad_norm": 107.41668355609693,
      "learning_rate": 7.5e-07,
      "loss": 5.8533,
      "step": 75
    },
    {
      "epoch": 0.0287524827390523,
      "grad_norm": 167.17458055865583,
      "learning_rate": 7.599999999999999e-07,
      "loss": 5.842,
      "step": 76
    },
    {
      "epoch": 0.02913080488035562,
      "grad_norm": 83.1018765552699,
      "learning_rate": 7.699999999999999e-07,
      "loss": 5.8106,
      "step": 77
    },
    {
      "epoch": 0.02950912702165894,
      "grad_norm": 930.4199949174266,
      "learning_rate": 7.799999999999999e-07,
      "loss": 5.9417,
      "step": 78
    },
    {
      "epoch": 0.02988744916296226,
      "grad_norm": 344.9243101513464,
      "learning_rate": 7.9e-07,
      "loss": 5.9401,
      "step": 79
    },
    {
      "epoch": 0.030265771304265582,
      "grad_norm": 203.82832876269842,
      "learning_rate": 8e-07,
      "loss": 5.8335,
      "step": 80
    },
    {
      "epoch": 0.030644093445568902,
      "grad_norm": 303.4319382071192,
      "learning_rate": 8.1e-07,
      "loss": 5.6823,
      "step": 81
    },
    {
      "epoch": 0.031022415586872222,
      "grad_norm": 248.28331376619403,
      "learning_rate": 8.199999999999999e-07,
      "loss": 5.7745,
      "step": 82
    },
    {
      "epoch": 0.03140073772817554,
      "grad_norm": 462.20565983043144,
      "learning_rate": 8.299999999999999e-07,
      "loss": 5.6386,
      "step": 83
    },
    {
      "epoch": 0.03177905986947886,
      "grad_norm": 194.41981862598635,
      "learning_rate": 8.399999999999999e-07,
      "loss": 5.5997,
      "step": 84
    },
    {
      "epoch": 0.03215738201078218,
      "grad_norm": 293.3275031516269,
      "learning_rate": 8.499999999999999e-07,
      "loss": 5.5106,
      "step": 85
    },
    {
      "epoch": 0.0325357041520855,
      "grad_norm": 140.97321101678344,
      "learning_rate": 8.599999999999999e-07,
      "loss": 5.4563,
      "step": 86
    },
    {
      "epoch": 0.03291402629338882,
      "grad_norm": 180.15140475284437,
      "learning_rate": 8.699999999999999e-07,
      "loss": 5.4357,
      "step": 87
    },
    {
      "epoch": 0.03329234843469214,
      "grad_norm": 333.3719583206301,
      "learning_rate": 8.799999999999999e-07,
      "loss": 5.3168,
      "step": 88
    },
    {
      "epoch": 0.03367067057599546,
      "grad_norm": 121.82713201522955,
      "learning_rate": 8.9e-07,
      "loss": 5.3945,
      "step": 89
    },
    {
      "epoch": 0.03404899271729878,
      "grad_norm": 582.7969295558685,
      "learning_rate": 9e-07,
      "loss": 5.3863,
      "step": 90
    },
    {
      "epoch": 0.0344273148586021,
      "grad_norm": 217.6434706478821,
      "learning_rate": 9.1e-07,
      "loss": 5.2662,
      "step": 91
    },
    {
      "epoch": 0.034805636999905416,
      "grad_norm": 374.4674448505233,
      "learning_rate": 9.2e-07,
      "loss": 5.2355,
      "step": 92
    },
    {
      "epoch": 0.03518395914120874,
      "grad_norm": 218.23465312606612,
      "learning_rate": 9.3e-07,
      "loss": 5.1486,
      "step": 93
    },
    {
      "epoch": 0.03556228128251206,
      "grad_norm": 98.81927420372956,
      "learning_rate": 9.399999999999999e-07,
      "loss": 5.0807,
      "step": 94
    },
    {
      "epoch": 0.03594060342381538,
      "grad_norm": 211.12146153212487,
      "learning_rate": 9.499999999999999e-07,
      "loss": 5.0853,
      "step": 95
    },
    {
      "epoch": 0.0363189255651187,
      "grad_norm": 190.3736868117524,
      "learning_rate": 9.6e-07,
      "loss": 5.0756,
      "step": 96
    },
    {
      "epoch": 0.03669724770642202,
      "grad_norm": 122.03862248450174,
      "learning_rate": 9.7e-07,
      "loss": 4.9252,
      "step": 97
    },
    {
      "epoch": 0.03707556984772534,
      "grad_norm": 410.81026410608786,
      "learning_rate": 9.8e-07,
      "loss": 5.0664,
      "step": 98
    },
    {
      "epoch": 0.03745389198902866,
      "grad_norm": 269.97951212839484,
      "learning_rate": 9.9e-07,
      "loss": 4.9091,
      "step": 99
    },
    {
      "epoch": 0.03783221413033198,
      "grad_norm": 260.7212338620472,
      "learning_rate": 1e-06,
      "loss": 4.8821,
      "step": 100
    },
    {
      "epoch": 0.0382105362716353,
      "grad_norm": 165.92539323350238,
      "learning_rate": 1.0099999999999999e-06,
      "loss": 4.7469,
      "step": 101
    },
    {
      "epoch": 0.03858885841293862,
      "grad_norm": 281.9862388742268,
      "learning_rate": 1.02e-06,
      "loss": 4.7974,
      "step": 102
    },
    {
      "epoch": 0.038967180554241934,
      "grad_norm": 164.28597977866295,
      "learning_rate": 1.0299999999999999e-06,
      "loss": 4.6513,
      "step": 103
    },
    {
      "epoch": 0.03934550269554526,
      "grad_norm": 315.7550450358392,
      "learning_rate": 1.04e-06,
      "loss": 4.7021,
      "step": 104
    },
    {
      "epoch": 0.039723824836848574,
      "grad_norm": 202.93065604656107,
      "learning_rate": 1.05e-06,
      "loss": 4.5712,
      "step": 105
    },
    {
      "epoch": 0.0401021469781519,
      "grad_norm": 210.26805622762828,
      "learning_rate": 1.06e-06,
      "loss": 4.6196,
      "step": 106
    },
    {
      "epoch": 0.040480469119455215,
      "grad_norm": 187.14917857744504,
      "learning_rate": 1.07e-06,
      "loss": 4.5484,
      "step": 107
    },
    {
      "epoch": 0.04085879126075854,
      "grad_norm": 155.43076076847103,
      "learning_rate": 1.08e-06,
      "loss": 4.4144,
      "step": 108
    },
    {
      "epoch": 0.041237113402061855,
      "grad_norm": 154.98829996861681,
      "learning_rate": 1.09e-06,
      "loss": 4.3404,
      "step": 109
    },
    {
      "epoch": 0.04161543554336518,
      "grad_norm": 141.595366217918,
      "learning_rate": 1.1e-06,
      "loss": 4.3111,
      "step": 110
    },
    {
      "epoch": 0.041993757684668495,
      "grad_norm": 134.27240833451944,
      "learning_rate": 1.11e-06,
      "loss": 4.1952,
      "step": 111
    },
    {
      "epoch": 0.04237207982597181,
      "grad_norm": 95.65375597330166,
      "learning_rate": 1.12e-06,
      "loss": 4.0809,
      "step": 112
    },
    {
      "epoch": 0.042750401967275135,
      "grad_norm": 109.07352101322023,
      "learning_rate": 1.1299999999999998e-06,
      "loss": 4.0286,
      "step": 113
    },
    {
      "epoch": 0.04312872410857845,
      "grad_norm": 114.47547920727833,
      "learning_rate": 1.1399999999999999e-06,
      "loss": 3.9147,
      "step": 114
    },
    {
      "epoch": 0.043507046249881776,
      "grad_norm": 105.22542090856187,
      "learning_rate": 1.1499999999999998e-06,
      "loss": 3.888,
      "step": 115
    },
    {
      "epoch": 0.04388536839118509,
      "grad_norm": 170.85609503557524,
      "learning_rate": 1.16e-06,
      "loss": 3.7806,
      "step": 116
    },
    {
      "epoch": 0.044263690532488416,
      "grad_norm": 132.60484964177928,
      "learning_rate": 1.1699999999999998e-06,
      "loss": 3.7388,
      "step": 117
    },
    {
      "epoch": 0.04464201267379173,
      "grad_norm": 817.4981900388101,
      "learning_rate": 1.18e-06,
      "loss": 3.8085,
      "step": 118
    },
    {
      "epoch": 0.045020334815095056,
      "grad_norm": 277.2968095396992,
      "learning_rate": 1.1899999999999998e-06,
      "loss": 3.7519,
      "step": 119
    },
    {
      "epoch": 0.04539865695639837,
      "grad_norm": 242.3036172020571,
      "learning_rate": 1.2e-06,
      "loss": 3.6811,
      "step": 120
    },
    {
      "epoch": 0.045776979097701696,
      "grad_norm": 147.12958250512,
      "learning_rate": 1.2099999999999998e-06,
      "loss": 3.5537,
      "step": 121
    },
    {
      "epoch": 0.04615530123900501,
      "grad_norm": 304.91416915276426,
      "learning_rate": 1.22e-06,
      "loss": 3.5308,
      "step": 122
    },
    {
      "epoch": 0.04653362338030833,
      "grad_norm": 228.8092972324273,
      "learning_rate": 1.2299999999999999e-06,
      "loss": 3.4916,
      "step": 123
    },
    {
      "epoch": 0.04691194552161165,
      "grad_norm": 197.353832945714,
      "learning_rate": 1.24e-06,
      "loss": 3.4215,
      "step": 124
    },
    {
      "epoch": 0.04729026766291497,
      "grad_norm": 228.72368996651358,
      "learning_rate": 1.2499999999999999e-06,
      "loss": 3.371,
      "step": 125
    },
    {
      "epoch": 0.04766858980421829,
      "grad_norm": 164.2731725612326,
      "learning_rate": 1.26e-06,
      "loss": 3.3909,
      "step": 126
    },
    {
      "epoch": 0.04804691194552161,
      "grad_norm": 186.5826183173996,
      "learning_rate": 1.27e-06,
      "loss": 3.3104,
      "step": 127
    },
    {
      "epoch": 0.048425234086824934,
      "grad_norm": 139.94786192019586,
      "learning_rate": 1.28e-06,
      "loss": 3.2437,
      "step": 128
    },
    {
      "epoch": 0.04880355622812825,
      "grad_norm": 170.89837594203516,
      "learning_rate": 1.29e-06,
      "loss": 3.2145,
      "step": 129
    },
    {
      "epoch": 0.049181878369431574,
      "grad_norm": 124.04755267516651,
      "learning_rate": 1.3e-06,
      "loss": 3.1275,
      "step": 130
    },
    {
      "epoch": 0.04956020051073489,
      "grad_norm": 112.7475091581948,
      "learning_rate": 1.31e-06,
      "loss": 3.1021,
      "step": 131
    },
    {
      "epoch": 0.04993852265203821,
      "grad_norm": 483.6676734928997,
      "learning_rate": 1.32e-06,
      "loss": 3.0251,
      "step": 132
    },
    {
      "epoch": 0.05031684479334153,
      "grad_norm": 131.48794283663062,
      "learning_rate": 1.33e-06,
      "loss": 3.0474,
      "step": 133
    },
    {
      "epoch": 0.05031684479334153,
      "eval_loss": 3.0402355194091797,
      "eval_runtime": 26.8305,
      "eval_samples_per_second": 32.985,
      "eval_steps_per_second": 1.044,
      "step": 133
    },
    {
      "epoch": 0.05031684479334153,
      "eval_bench_accuracy_arc_challenge": 0.2714285714285714,
      "eval_bench_accuracy_hellaswag": 0.22,
      "eval_bench_accuracy_mmlu": 0.23478260869565218,
      "eval_bench_average_accuracy": 0.2420703933747412,
      "eval_bench_loss": 6.577301560786733,
      "eval_bench_total_accuracy": 0.23956043956043957,
      "step": 133
    },
    {
      "epoch": 0.05069516693464485,
      "grad_norm": 664.2692049220283,
      "learning_rate": 1.34e-06,
      "loss": 3.0489,
      "step": 134
    },
    {
      "epoch": 0.05107348907594817,
      "grad_norm": 164.70902413028506,
      "learning_rate": 1.35e-06,
      "loss": 3.0729,
      "step": 135
    },
    {
      "epoch": 0.05145181121725149,
      "grad_norm": 778.4019675411471,
      "learning_rate": 1.3600000000000001e-06,
      "loss": 2.9025,
      "step": 136
    },
    {
      "epoch": 0.05183013335855481,
      "grad_norm": 141.784859477734,
      "learning_rate": 1.37e-06,
      "loss": 2.9153,
      "step": 137
    },
    {
      "epoch": 0.05220845549985813,
      "grad_norm": 815.6337164546584,
      "learning_rate": 1.38e-06,
      "loss": 2.9767,
      "step": 138
    },
    {
      "epoch": 0.05258677764116145,
      "grad_norm": 387.14144869932585,
      "learning_rate": 1.3899999999999998e-06,
      "loss": 2.9545,
      "step": 139
    },
    {
      "epoch": 0.05296509978246477,
      "grad_norm": 1286.7446765387322,
      "learning_rate": 1.4e-06,
      "loss": 2.9779,
      "step": 140
    },
    {
      "epoch": 0.05334342192376809,
      "grad_norm": 170.85639571110613,
      "learning_rate": 1.4099999999999998e-06,
      "loss": 2.8642,
      "step": 141
    },
    {
      "epoch": 0.05372174406507141,
      "grad_norm": 375.24244542748465,
      "learning_rate": 1.42e-06,
      "loss": 2.7942,
      "step": 142
    },
    {
      "epoch": 0.054100066206374725,
      "grad_norm": 154.53620941237315,
      "learning_rate": 1.4299999999999999e-06,
      "loss": 2.7527,
      "step": 143
    },
    {
      "epoch": 0.05447838834767805,
      "grad_norm": 188.97826644064364,
      "learning_rate": 1.44e-06,
      "loss": 2.7492,
      "step": 144
    },
    {
      "epoch": 0.054856710488981365,
      "grad_norm": 103.19619548153565,
      "learning_rate": 1.4499999999999999e-06,
      "loss": 2.6708,
      "step": 145
    },
    {
      "epoch": 0.05523503263028469,
      "grad_norm": 125.47407228350237,
      "learning_rate": 1.46e-06,
      "loss": 2.6737,
      "step": 146
    },
    {
      "epoch": 0.055613354771588006,
      "grad_norm": 71.31808903587059,
      "learning_rate": 1.47e-06,
      "loss": 2.6175,
      "step": 147
    },
    {
      "epoch": 0.05599167691289133,
      "grad_norm": 158.4470726659215,
      "learning_rate": 1.48e-06,
      "loss": 2.5772,
      "step": 148
    },
    {
      "epoch": 0.056369999054194646,
      "grad_norm": 213.54517556280484,
      "learning_rate": 1.49e-06,
      "loss": 2.5397,
      "step": 149
    },
    {
      "epoch": 0.05674832119549797,
      "grad_norm": 94.87447540886092,
      "learning_rate": 1.5e-06,
      "loss": 2.5007,
      "step": 150
    },
    {
      "epoch": 0.057126643336801286,
      "grad_norm": 140.6331701396571,
      "learning_rate": 1.51e-06,
      "loss": 2.4911,
      "step": 151
    },
    {
      "epoch": 0.0575049654781046,
      "grad_norm": 71.42229734282893,
      "learning_rate": 1.5199999999999998e-06,
      "loss": 2.3964,
      "step": 152
    },
    {
      "epoch": 0.057883287619407926,
      "grad_norm": 100.92797990716835,
      "learning_rate": 1.53e-06,
      "loss": 2.3796,
      "step": 153
    },
    {
      "epoch": 0.05826160976071124,
      "grad_norm": 69.12965458867137,
      "learning_rate": 1.5399999999999999e-06,
      "loss": 2.4147,
      "step": 154
    },
    {
      "epoch": 0.058639931902014567,
      "grad_norm": 68.31144568523656,
      "learning_rate": 1.55e-06,
      "loss": 2.285,
      "step": 155
    },
    {
      "epoch": 0.05901825404331788,
      "grad_norm": 63.86407191747168,
      "learning_rate": 1.5599999999999999e-06,
      "loss": 2.2905,
      "step": 156
    },
    {
      "epoch": 0.05939657618462121,
      "grad_norm": 89.9702991999028,
      "learning_rate": 1.57e-06,
      "loss": 2.2642,
      "step": 157
    },
    {
      "epoch": 0.05977489832592452,
      "grad_norm": 38.70583191014119,
      "learning_rate": 1.58e-06,
      "loss": 2.1927,
      "step": 158
    },
    {
      "epoch": 0.06015322046722785,
      "grad_norm": 150.0176513817121,
      "learning_rate": 1.59e-06,
      "loss": 2.2046,
      "step": 159
    },
    {
      "epoch": 0.060531542608531164,
      "grad_norm": 85.38752600608713,
      "learning_rate": 1.6e-06,
      "loss": 2.1777,
      "step": 160
    },
    {
      "epoch": 0.06090986474983449,
      "grad_norm": 108.46382637315519,
      "learning_rate": 1.61e-06,
      "loss": 2.0947,
      "step": 161
    },
    {
      "epoch": 0.061288186891137804,
      "grad_norm": 72.33751976980996,
      "learning_rate": 1.62e-06,
      "loss": 2.1455,
      "step": 162
    },
    {
      "epoch": 0.06166650903244112,
      "grad_norm": 254.7588636023186,
      "learning_rate": 1.6299999999999999e-06,
      "loss": 2.0967,
      "step": 163
    },
    {
      "epoch": 0.062044831173744444,
      "grad_norm": 143.3727693773649,
      "learning_rate": 1.6399999999999998e-06,
      "loss": 2.0443,
      "step": 164
    },
    {
      "epoch": 0.06242315331504776,
      "grad_norm": 672.6219381081797,
      "learning_rate": 1.6499999999999999e-06,
      "loss": 2.2139,
      "step": 165
    },
    {
      "epoch": 0.06280147545635108,
      "grad_norm": 89.69156829747156,
      "learning_rate": 1.6599999999999998e-06,
      "loss": 2.0433,
      "step": 166
    },
    {
      "epoch": 0.06317979759765441,
      "grad_norm": 47.054580203479496,
      "learning_rate": 1.6699999999999999e-06,
      "loss": 1.9805,
      "step": 167
    },
    {
      "epoch": 0.06355811973895772,
      "grad_norm": 53.90193516042071,
      "learning_rate": 1.6799999999999998e-06,
      "loss": 1.8572,
      "step": 168
    },
    {
      "epoch": 0.06393644188026104,
      "grad_norm": 55.351958687059195,
      "learning_rate": 1.69e-06,
      "loss": 1.8879,
      "step": 169
    },
    {
      "epoch": 0.06431476402156436,
      "grad_norm": 30.956994176305464,
      "learning_rate": 1.6999999999999998e-06,
      "loss": 1.8335,
      "step": 170
    },
    {
      "epoch": 0.06469308616286769,
      "grad_norm": 81.23380900946358,
      "learning_rate": 1.71e-06,
      "loss": 1.8101,
      "step": 171
    },
    {
      "epoch": 0.065071408304171,
      "grad_norm": 46.43733520396148,
      "learning_rate": 1.7199999999999998e-06,
      "loss": 1.8177,
      "step": 172
    },
    {
      "epoch": 0.06544973044547432,
      "grad_norm": 46.90830376181402,
      "learning_rate": 1.73e-06,
      "loss": 1.7543,
      "step": 173
    },
    {
      "epoch": 0.06582805258677764,
      "grad_norm": 69.19161149417722,
      "learning_rate": 1.7399999999999999e-06,
      "loss": 1.7712,
      "step": 174
    },
    {
      "epoch": 0.06620637472808096,
      "grad_norm": 46.99692135130498,
      "learning_rate": 1.75e-06,
      "loss": 1.7728,
      "step": 175
    },
    {
      "epoch": 0.06658469686938429,
      "grad_norm": 85.68605330443327,
      "learning_rate": 1.7599999999999999e-06,
      "loss": 1.7186,
      "step": 176
    },
    {
      "epoch": 0.0669630190106876,
      "grad_norm": 48.57963404347663,
      "learning_rate": 1.77e-06,
      "loss": 1.6979,
      "step": 177
    },
    {
      "epoch": 0.06734134115199092,
      "grad_norm": 111.44637207499896,
      "learning_rate": 1.78e-06,
      "loss": 1.734,
      "step": 178
    },
    {
      "epoch": 0.06771966329329424,
      "grad_norm": 83.89157732570692,
      "learning_rate": 1.79e-06,
      "loss": 1.6947,
      "step": 179
    },
    {
      "epoch": 0.06809798543459757,
      "grad_norm": 50.66006983599147,
      "learning_rate": 1.8e-06,
      "loss": 1.6385,
      "step": 180
    },
    {
      "epoch": 0.06847630757590088,
      "grad_norm": 47.32959657636825,
      "learning_rate": 1.81e-06,
      "loss": 1.5717,
      "step": 181
    },
    {
      "epoch": 0.0688546297172042,
      "grad_norm": 71.70671420810187,
      "learning_rate": 1.82e-06,
      "loss": 1.5167,
      "step": 182
    },
    {
      "epoch": 0.06923295185850752,
      "grad_norm": 48.11379424928171,
      "learning_rate": 1.83e-06,
      "loss": 1.5992,
      "step": 183
    },
    {
      "epoch": 0.06961127399981083,
      "grad_norm": 54.01731463177801,
      "learning_rate": 1.84e-06,
      "loss": 1.5217,
      "step": 184
    },
    {
      "epoch": 0.06998959614111416,
      "grad_norm": 39.52299725178149,
      "learning_rate": 1.85e-06,
      "loss": 1.5009,
      "step": 185
    },
    {
      "epoch": 0.07036791828241748,
      "grad_norm": 63.37058186080119,
      "learning_rate": 1.86e-06,
      "loss": 1.5853,
      "step": 186
    },
    {
      "epoch": 0.0707462404237208,
      "grad_norm": 44.5116426583779,
      "learning_rate": 1.87e-06,
      "loss": 1.4865,
      "step": 187
    },
    {
      "epoch": 0.07112456256502411,
      "grad_norm": 40.56409454228496,
      "learning_rate": 1.8799999999999998e-06,
      "loss": 1.4732,
      "step": 188
    },
    {
      "epoch": 0.07150288470632744,
      "grad_norm": 31.923505092753718,
      "learning_rate": 1.89e-06,
      "loss": 1.4519,
      "step": 189
    },
    {
      "epoch": 0.07188120684763076,
      "grad_norm": 34.50709112981039,
      "learning_rate": 1.8999999999999998e-06,
      "loss": 1.4205,
      "step": 190
    },
    {
      "epoch": 0.07225952898893408,
      "grad_norm": 22.09682402936458,
      "learning_rate": 1.91e-06,
      "loss": 1.38,
      "step": 191
    },
    {
      "epoch": 0.0726378511302374,
      "grad_norm": 25.3767669172789,
      "learning_rate": 1.92e-06,
      "loss": 1.3879,
      "step": 192
    },
    {
      "epoch": 0.07301617327154071,
      "grad_norm": 29.51813748066488,
      "learning_rate": 1.9299999999999997e-06,
      "loss": 1.3506,
      "step": 193
    },
    {
      "epoch": 0.07339449541284404,
      "grad_norm": 21.76501410574832,
      "learning_rate": 1.94e-06,
      "loss": 1.3237,
      "step": 194
    },
    {
      "epoch": 0.07377281755414736,
      "grad_norm": 20.74781891582525,
      "learning_rate": 1.95e-06,
      "loss": 1.3639,
      "step": 195
    },
    {
      "epoch": 0.07415113969545067,
      "grad_norm": 27.66733930317673,
      "learning_rate": 1.96e-06,
      "loss": 1.3061,
      "step": 196
    },
    {
      "epoch": 0.07452946183675399,
      "grad_norm": 21.087698250942193,
      "learning_rate": 1.9699999999999998e-06,
      "loss": 1.375,
      "step": 197
    },
    {
      "epoch": 0.07490778397805732,
      "grad_norm": 22.065927379036225,
      "learning_rate": 1.98e-06,
      "loss": 1.3219,
      "step": 198
    },
    {
      "epoch": 0.07528610611936064,
      "grad_norm": 37.132637966902955,
      "learning_rate": 1.99e-06,
      "loss": 1.2424,
      "step": 199
    },
    {
      "epoch": 0.07566442826066395,
      "grad_norm": 20.85100061426098,
      "learning_rate": 2e-06,
      "loss": 1.2973,
      "step": 200
    },
    {
      "epoch": 0.07604275040196727,
      "grad_norm": 19.748272671220768,
      "learning_rate": 2.01e-06,
      "loss": 1.2371,
      "step": 201
    },
    {
      "epoch": 0.0764210725432706,
      "grad_norm": 24.073543088140834,
      "learning_rate": 2.0199999999999997e-06,
      "loss": 1.252,
      "step": 202
    },
    {
      "epoch": 0.07679939468457392,
      "grad_norm": 34.22154387867275,
      "learning_rate": 2.0299999999999996e-06,
      "loss": 1.2911,
      "step": 203
    },
    {
      "epoch": 0.07717771682587724,
      "grad_norm": 16.511181722757403,
      "learning_rate": 2.04e-06,
      "loss": 1.2321,
      "step": 204
    },
    {
      "epoch": 0.07755603896718055,
      "grad_norm": 12.872226386234452,
      "learning_rate": 2.05e-06,
      "loss": 1.1767,
      "step": 205
    },
    {
      "epoch": 0.07793436110848387,
      "grad_norm": 15.436365816346868,
      "learning_rate": 2.0599999999999998e-06,
      "loss": 1.1955,
      "step": 206
    },
    {
      "epoch": 0.0783126832497872,
      "grad_norm": 12.062107586682833,
      "learning_rate": 2.0699999999999997e-06,
      "loss": 1.1799,
      "step": 207
    },
    {
      "epoch": 0.07869100539109052,
      "grad_norm": 49.38765930014822,
      "learning_rate": 2.08e-06,
      "loss": 1.1762,
      "step": 208
    },
    {
      "epoch": 0.07906932753239383,
      "grad_norm": 23.38441549316206,
      "learning_rate": 2.09e-06,
      "loss": 1.1831,
      "step": 209
    },
    {
      "epoch": 0.07944764967369715,
      "grad_norm": 22.28035230836217,
      "learning_rate": 2.1e-06,
      "loss": 1.1858,
      "step": 210
    },
    {
      "epoch": 0.07982597181500048,
      "grad_norm": 43.05138932031075,
      "learning_rate": 2.1099999999999997e-06,
      "loss": 1.2106,
      "step": 211
    },
    {
      "epoch": 0.0802042939563038,
      "grad_norm": 22.919581037837645,
      "learning_rate": 2.12e-06,
      "loss": 1.1872,
      "step": 212
    },
    {
      "epoch": 0.08058261609760711,
      "grad_norm": 106.27528509092721,
      "learning_rate": 2.13e-06,
      "loss": 1.1807,
      "step": 213
    },
    {
      "epoch": 0.08096093823891043,
      "grad_norm": 62.766496496977574,
      "learning_rate": 2.14e-06,
      "loss": 1.1932,
      "step": 214
    },
    {
      "epoch": 0.08133926038021375,
      "grad_norm": 66.54674237816508,
      "learning_rate": 2.1499999999999997e-06,
      "loss": 1.1328,
      "step": 215
    },
    {
      "epoch": 0.08171758252151708,
      "grad_norm": 66.81453157766589,
      "learning_rate": 2.16e-06,
      "loss": 1.1613,
      "step": 216
    },
    {
      "epoch": 0.0820959046628204,
      "grad_norm": 35.57901795776919,
      "learning_rate": 2.17e-06,
      "loss": 1.1821,
      "step": 217
    },
    {
      "epoch": 0.08247422680412371,
      "grad_norm": 10.30900211340774,
      "learning_rate": 2.18e-06,
      "loss": 1.1023,
      "step": 218
    },
    {
      "epoch": 0.08285254894542703,
      "grad_norm": 29.533042017371177,
      "learning_rate": 2.1899999999999998e-06,
      "loss": 1.1669,
      "step": 219
    },
    {
      "epoch": 0.08323087108673036,
      "grad_norm": 22.47096674174166,
      "learning_rate": 2.2e-06,
      "loss": 1.1612,
      "step": 220
    },
    {
      "epoch": 0.08360919322803367,
      "grad_norm": 13.583126551810135,
      "learning_rate": 2.21e-06,
      "loss": 1.0867,
      "step": 221
    },
    {
      "epoch": 0.08398751536933699,
      "grad_norm": 9.91479302526445,
      "learning_rate": 2.22e-06,
      "loss": 1.0916,
      "step": 222
    },
    {
      "epoch": 0.0843658375106403,
      "grad_norm": 11.269431287067826,
      "learning_rate": 2.23e-06,
      "loss": 1.1264,
      "step": 223
    },
    {
      "epoch": 0.08474415965194362,
      "grad_norm": 7.7465735801712805,
      "learning_rate": 2.24e-06,
      "loss": 1.136,
      "step": 224
    },
    {
      "epoch": 0.08512248179324695,
      "grad_norm": 8.687635755465738,
      "learning_rate": 2.25e-06,
      "loss": 1.0803,
      "step": 225
    },
    {
      "epoch": 0.08550080393455027,
      "grad_norm": 11.628437205512707,
      "learning_rate": 2.2599999999999995e-06,
      "loss": 1.1646,
      "step": 226
    },
    {
      "epoch": 0.08587912607585359,
      "grad_norm": 9.268721256498573,
      "learning_rate": 2.27e-06,
      "loss": 1.1015,
      "step": 227
    },
    {
      "epoch": 0.0862574482171569,
      "grad_norm": 6.187500026884083,
      "learning_rate": 2.2799999999999998e-06,
      "loss": 1.0662,
      "step": 228
    },
    {
      "epoch": 0.08663577035846023,
      "grad_norm": 8.62028463677054,
      "learning_rate": 2.29e-06,
      "loss": 1.052,
      "step": 229
    },
    {
      "epoch": 0.08701409249976355,
      "grad_norm": 9.674790887814405,
      "learning_rate": 2.2999999999999996e-06,
      "loss": 1.0978,
      "step": 230
    },
    {
      "epoch": 0.08739241464106687,
      "grad_norm": 8.326705028491853,
      "learning_rate": 2.31e-06,
      "loss": 1.0184,
      "step": 231
    },
    {
      "epoch": 0.08777073678237018,
      "grad_norm": 7.318027642173224,
      "learning_rate": 2.32e-06,
      "loss": 1.0509,
      "step": 232
    },
    {
      "epoch": 0.0881490589236735,
      "grad_norm": 12.85041462496061,
      "learning_rate": 2.33e-06,
      "loss": 1.0556,
      "step": 233
    },
    {
      "epoch": 0.08852738106497683,
      "grad_norm": 9.328207044954535,
      "learning_rate": 2.3399999999999996e-06,
      "loss": 1.0816,
      "step": 234
    },
    {
      "epoch": 0.08890570320628015,
      "grad_norm": 7.022150416570471,
      "learning_rate": 2.35e-06,
      "loss": 1.0466,
      "step": 235
    },
    {
      "epoch": 0.08928402534758346,
      "grad_norm": 8.86057501782776,
      "learning_rate": 2.36e-06,
      "loss": 1.04,
      "step": 236
    },
    {
      "epoch": 0.08966234748888678,
      "grad_norm": 9.072613041437753,
      "learning_rate": 2.37e-06,
      "loss": 1.039,
      "step": 237
    },
    {
      "epoch": 0.09004066963019011,
      "grad_norm": 11.561198612520238,
      "learning_rate": 2.3799999999999997e-06,
      "loss": 1.025,
      "step": 238
    },
    {
      "epoch": 0.09041899177149343,
      "grad_norm": 5.796410505813014,
      "learning_rate": 2.39e-06,
      "loss": 1.0007,
      "step": 239
    },
    {
      "epoch": 0.09079731391279675,
      "grad_norm": 13.451590053171754,
      "learning_rate": 2.4e-06,
      "loss": 1.0051,
      "step": 240
    },
    {
      "epoch": 0.09117563605410006,
      "grad_norm": 8.917436837849364,
      "learning_rate": 2.4100000000000002e-06,
      "loss": 1.0866,
      "step": 241
    },
    {
      "epoch": 0.09155395819540339,
      "grad_norm": 4.792174398814023,
      "learning_rate": 2.4199999999999997e-06,
      "loss": 1.0022,
      "step": 242
    },
    {
      "epoch": 0.09193228033670671,
      "grad_norm": 6.487991210049911,
      "learning_rate": 2.43e-06,
      "loss": 0.976,
      "step": 243
    },
    {
      "epoch": 0.09231060247801003,
      "grad_norm": 9.885175529767102,
      "learning_rate": 2.44e-06,
      "loss": 1.0038,
      "step": 244
    },
    {
      "epoch": 0.09268892461931334,
      "grad_norm": 5.6067215406645134,
      "learning_rate": 2.4500000000000003e-06,
      "loss": 1.0559,
      "step": 245
    },
    {
      "epoch": 0.09306724676061666,
      "grad_norm": 14.632584569195519,
      "learning_rate": 2.4599999999999997e-06,
      "loss": 1.0229,
      "step": 246
    },
    {
      "epoch": 0.09344556890191999,
      "grad_norm": 6.406784955802286,
      "learning_rate": 2.47e-06,
      "loss": 1.0252,
      "step": 247
    },
    {
      "epoch": 0.0938238910432233,
      "grad_norm": 7.547314965665046,
      "learning_rate": 2.48e-06,
      "loss": 0.9838,
      "step": 248
    },
    {
      "epoch": 0.09420221318452662,
      "grad_norm": 6.44920071987235,
      "learning_rate": 2.4900000000000003e-06,
      "loss": 0.9664,
      "step": 249
    },
    {
      "epoch": 0.09458053532582994,
      "grad_norm": 5.4686676744513765,
      "learning_rate": 2.4999999999999998e-06,
      "loss": 0.9781,
      "step": 250
    },
    {
      "epoch": 0.09495885746713327,
      "grad_norm": 5.951563165398436,
      "learning_rate": 2.5099999999999997e-06,
      "loss": 0.9953,
      "step": 251
    },
    {
      "epoch": 0.09533717960843659,
      "grad_norm": 5.7316411610727105,
      "learning_rate": 2.52e-06,
      "loss": 1.0431,
      "step": 252
    },
    {
      "epoch": 0.0957155017497399,
      "grad_norm": 4.90373215304178,
      "learning_rate": 2.5299999999999995e-06,
      "loss": 0.9738,
      "step": 253
    },
    {
      "epoch": 0.09609382389104322,
      "grad_norm": 4.018027173598048,
      "learning_rate": 2.54e-06,
      "loss": 1.0113,
      "step": 254
    },
    {
      "epoch": 0.09647214603234654,
      "grad_norm": 6.869682846334475,
      "learning_rate": 2.5499999999999997e-06,
      "loss": 0.9812,
      "step": 255
    },
    {
      "epoch": 0.09685046817364987,
      "grad_norm": 5.959477622367862,
      "learning_rate": 2.56e-06,
      "loss": 1.0031,
      "step": 256
    },
    {
      "epoch": 0.09722879031495318,
      "grad_norm": 4.231167141984737,
      "learning_rate": 2.5699999999999995e-06,
      "loss": 1.0319,
      "step": 257
    },
    {
      "epoch": 0.0976071124562565,
      "grad_norm": 6.714523011394094,
      "learning_rate": 2.58e-06,
      "loss": 0.9851,
      "step": 258
    },
    {
      "epoch": 0.09798543459755982,
      "grad_norm": 6.020515136070658,
      "learning_rate": 2.5899999999999998e-06,
      "loss": 0.9782,
      "step": 259
    },
    {
      "epoch": 0.09836375673886315,
      "grad_norm": 4.681331319695956,
      "learning_rate": 2.6e-06,
      "loss": 1.014,
      "step": 260
    },
    {
      "epoch": 0.09874207888016646,
      "grad_norm": 7.4305112606450905,
      "learning_rate": 2.6099999999999996e-06,
      "loss": 0.9751,
      "step": 261
    },
    {
      "epoch": 0.09912040102146978,
      "grad_norm": 3.819753600694035,
      "learning_rate": 2.62e-06,
      "loss": 0.968,
      "step": 262
    },
    {
      "epoch": 0.0994987231627731,
      "grad_norm": 5.789415532330102,
      "learning_rate": 2.63e-06,
      "loss": 0.9529,
      "step": 263
    },
    {
      "epoch": 0.09987704530407641,
      "grad_norm": 4.539898474801753,
      "learning_rate": 2.64e-06,
      "loss": 0.978,
      "step": 264
    },
    {
      "epoch": 0.10025536744537974,
      "grad_norm": 3.2389391663703306,
      "learning_rate": 2.6499999999999996e-06,
      "loss": 0.9833,
      "step": 265
    },
    {
      "epoch": 0.10063368958668306,
      "grad_norm": 5.4718084763112556,
      "learning_rate": 2.66e-06,
      "loss": 0.9714,
      "step": 266
    },
    {
      "epoch": 0.10063368958668306,
      "eval_loss": 0.9851981997489929,
      "eval_runtime": 27.2115,
      "eval_samples_per_second": 32.523,
      "eval_steps_per_second": 1.029,
      "step": 266
    },
    {
      "epoch": 0.10063368958668306,
      "eval_bench_accuracy_arc_challenge": 0.29285714285714287,
      "eval_bench_accuracy_hellaswag": 0.215,
      "eval_bench_accuracy_mmlu": 0.3826086956521739,
      "eval_bench_average_accuracy": 0.29682194616977225,
      "eval_bench_loss": 6.3663490696957235,
      "eval_bench_total_accuracy": 0.2813186813186813,
      "step": 266
    },
    {
      "epoch": 0.10101201172798638,
      "grad_norm": 4.736473735176666,
      "learning_rate": 2.67e-06,
      "loss": 1.0245,
      "step": 267
    },
    {
      "epoch": 0.1013903338692897,
      "grad_norm": 2.927740836124029,
      "learning_rate": 2.68e-06,
      "loss": 0.9906,
      "step": 268
    },
    {
      "epoch": 0.10176865601059303,
      "grad_norm": 4.622383990826824,
      "learning_rate": 2.6899999999999997e-06,
      "loss": 0.9679,
      "step": 269
    },
    {
      "epoch": 0.10214697815189634,
      "grad_norm": 3.8746535383849836,
      "learning_rate": 2.7e-06,
      "loss": 0.9211,
      "step": 270
    },
    {
      "epoch": 0.10252530029319966,
      "grad_norm": 4.361727224982868,
      "learning_rate": 2.71e-06,
      "loss": 0.9779,
      "step": 271
    },
    {
      "epoch": 0.10290362243450298,
      "grad_norm": 3.2847575684010795,
      "learning_rate": 2.7200000000000002e-06,
      "loss": 0.969,
      "step": 272
    },
    {
      "epoch": 0.1032819445758063,
      "grad_norm": 2.946259099361567,
      "learning_rate": 2.7299999999999997e-06,
      "loss": 0.9374,
      "step": 273
    },
    {
      "epoch": 0.10366026671710962,
      "grad_norm": 3.5163454504687364,
      "learning_rate": 2.74e-06,
      "loss": 0.9809,
      "step": 274
    },
    {
      "epoch": 0.10403858885841294,
      "grad_norm": 4.1448737340815045,
      "learning_rate": 2.75e-06,
      "loss": 0.9816,
      "step": 275
    },
    {
      "epoch": 0.10441691099971626,
      "grad_norm": 3.345900089125294,
      "learning_rate": 2.76e-06,
      "loss": 0.94,
      "step": 276
    },
    {
      "epoch": 0.10479523314101957,
      "grad_norm": 4.756231356260067,
      "learning_rate": 2.7699999999999997e-06,
      "loss": 0.9948,
      "step": 277
    },
    {
      "epoch": 0.1051735552823229,
      "grad_norm": 3.395795830645774,
      "learning_rate": 2.7799999999999996e-06,
      "loss": 0.9852,
      "step": 278
    },
    {
      "epoch": 0.10555187742362622,
      "grad_norm": 3.7361359597792085,
      "learning_rate": 2.79e-06,
      "loss": 0.9705,
      "step": 279
    },
    {
      "epoch": 0.10593019956492954,
      "grad_norm": 2.9021780470974536,
      "learning_rate": 2.8e-06,
      "loss": 0.9517,
      "step": 280
    },
    {
      "epoch": 0.10630852170623285,
      "grad_norm": 3.3140561096891408,
      "learning_rate": 2.8099999999999998e-06,
      "loss": 0.9518,
      "step": 281
    },
    {
      "epoch": 0.10668684384753618,
      "grad_norm": 4.955772041684827,
      "learning_rate": 2.8199999999999997e-06,
      "loss": 0.949,
      "step": 282
    },
    {
      "epoch": 0.1070651659888395,
      "grad_norm": 2.7495737336593447,
      "learning_rate": 2.83e-06,
      "loss": 0.9637,
      "step": 283
    },
    {
      "epoch": 0.10744348813014282,
      "grad_norm": 5.5808851538998745,
      "learning_rate": 2.84e-06,
      "loss": 0.9149,
      "step": 284
    },
    {
      "epoch": 0.10782181027144613,
      "grad_norm": 3.2461608503776582,
      "learning_rate": 2.85e-06,
      "loss": 0.9562,
      "step": 285
    },
    {
      "epoch": 0.10820013241274945,
      "grad_norm": 3.016464443847612,
      "learning_rate": 2.8599999999999997e-06,
      "loss": 0.9635,
      "step": 286
    },
    {
      "epoch": 0.10857845455405278,
      "grad_norm": 3.1653672708590936,
      "learning_rate": 2.87e-06,
      "loss": 1.0064,
      "step": 287
    },
    {
      "epoch": 0.1089567766953561,
      "grad_norm": 2.1243065072255907,
      "learning_rate": 2.88e-06,
      "loss": 0.9279,
      "step": 288
    },
    {
      "epoch": 0.10933509883665941,
      "grad_norm": 3.4080159282806712,
      "learning_rate": 2.89e-06,
      "loss": 0.9759,
      "step": 289
    },
    {
      "epoch": 0.10971342097796273,
      "grad_norm": 2.610557409129719,
      "learning_rate": 2.8999999999999998e-06,
      "loss": 0.9787,
      "step": 290
    },
    {
      "epoch": 0.11009174311926606,
      "grad_norm": 2.2107636510154176,
      "learning_rate": 2.91e-06,
      "loss": 0.9296,
      "step": 291
    },
    {
      "epoch": 0.11047006526056938,
      "grad_norm": 4.245908140335627,
      "learning_rate": 2.92e-06,
      "loss": 0.9273,
      "step": 292
    },
    {
      "epoch": 0.1108483874018727,
      "grad_norm": 2.895847446673922,
      "learning_rate": 2.93e-06,
      "loss": 0.9383,
      "step": 293
    },
    {
      "epoch": 0.11122670954317601,
      "grad_norm": 2.704339168426421,
      "learning_rate": 2.94e-06,
      "loss": 0.9153,
      "step": 294
    },
    {
      "epoch": 0.11160503168447933,
      "grad_norm": 2.701813364341608,
      "learning_rate": 2.95e-06,
      "loss": 0.9299,
      "step": 295
    },
    {
      "epoch": 0.11198335382578266,
      "grad_norm": 2.948359459278812,
      "learning_rate": 2.96e-06,
      "loss": 0.9702,
      "step": 296
    },
    {
      "epoch": 0.11236167596708597,
      "grad_norm": 3.377595158199111,
      "learning_rate": 2.97e-06,
      "loss": 0.9554,
      "step": 297
    },
    {
      "epoch": 0.11273999810838929,
      "grad_norm": 2.5213378940105415,
      "learning_rate": 2.98e-06,
      "loss": 0.9312,
      "step": 298
    },
    {
      "epoch": 0.11311832024969261,
      "grad_norm": 4.796315482527464,
      "learning_rate": 2.99e-06,
      "loss": 0.9294,
      "step": 299
    },
    {
      "epoch": 0.11349664239099594,
      "grad_norm": 2.161917946044457,
      "learning_rate": 3e-06,
      "loss": 0.9603,
      "step": 300
    },
    {
      "epoch": 0.11387496453229926,
      "grad_norm": 4.2290402280104145,
      "learning_rate": 3.0099999999999996e-06,
      "loss": 0.9079,
      "step": 301
    },
    {
      "epoch": 0.11425328667360257,
      "grad_norm": 2.7667893528721867,
      "learning_rate": 3.02e-06,
      "loss": 0.953,
      "step": 302
    },
    {
      "epoch": 0.11463160881490589,
      "grad_norm": 9.065359561610483,
      "learning_rate": 3.03e-06,
      "loss": 0.9891,
      "step": 303
    },
    {
      "epoch": 0.1150099309562092,
      "grad_norm": 3.629194869203107,
      "learning_rate": 3.0399999999999997e-06,
      "loss": 0.9434,
      "step": 304
    },
    {
      "epoch": 0.11538825309751254,
      "grad_norm": 3.2434020969746182,
      "learning_rate": 3.0499999999999996e-06,
      "loss": 0.9289,
      "step": 305
    },
    {
      "epoch": 0.11576657523881585,
      "grad_norm": 3.266784032620147,
      "learning_rate": 3.06e-06,
      "loss": 0.941,
      "step": 306
    },
    {
      "epoch": 0.11614489738011917,
      "grad_norm": 2.2252097372145627,
      "learning_rate": 3.07e-06,
      "loss": 0.9197,
      "step": 307
    },
    {
      "epoch": 0.11652321952142249,
      "grad_norm": 2.2906797269719683,
      "learning_rate": 3.0799999999999997e-06,
      "loss": 0.9278,
      "step": 308
    },
    {
      "epoch": 0.11690154166272582,
      "grad_norm": 2.899028879345415,
      "learning_rate": 3.0899999999999996e-06,
      "loss": 0.9177,
      "step": 309
    },
    {
      "epoch": 0.11727986380402913,
      "grad_norm": 1.9374921205584867,
      "learning_rate": 3.1e-06,
      "loss": 0.9049,
      "step": 310
    },
    {
      "epoch": 0.11765818594533245,
      "grad_norm": 1.90674843142603,
      "learning_rate": 3.11e-06,
      "loss": 0.9563,
      "step": 311
    },
    {
      "epoch": 0.11803650808663577,
      "grad_norm": 1.878846884674951,
      "learning_rate": 3.1199999999999998e-06,
      "loss": 0.9139,
      "step": 312
    },
    {
      "epoch": 0.1184148302279391,
      "grad_norm": 1.8411547245015762,
      "learning_rate": 3.1299999999999997e-06,
      "loss": 0.947,
      "step": 313
    },
    {
      "epoch": 0.11879315236924241,
      "grad_norm": 1.6495211524540856,
      "learning_rate": 3.14e-06,
      "loss": 0.8994,
      "step": 314
    },
    {
      "epoch": 0.11917147451054573,
      "grad_norm": 1.979339834494396,
      "learning_rate": 3.15e-06,
      "loss": 0.9425,
      "step": 315
    },
    {
      "epoch": 0.11954979665184905,
      "grad_norm": 1.6881739152797177,
      "learning_rate": 3.16e-06,
      "loss": 0.9079,
      "step": 316
    },
    {
      "epoch": 0.11992811879315236,
      "grad_norm": 1.7476621404963093,
      "learning_rate": 3.1699999999999997e-06,
      "loss": 0.9342,
      "step": 317
    },
    {
      "epoch": 0.1203064409344557,
      "grad_norm": 1.7825714782443438,
      "learning_rate": 3.18e-06,
      "loss": 0.9736,
      "step": 318
    },
    {
      "epoch": 0.12068476307575901,
      "grad_norm": 1.7904157984440023,
      "learning_rate": 3.19e-06,
      "loss": 0.8904,
      "step": 319
    },
    {
      "epoch": 0.12106308521706233,
      "grad_norm": 1.8488826023075036,
      "learning_rate": 3.2e-06,
      "loss": 0.9374,
      "step": 320
    },
    {
      "epoch": 0.12144140735836564,
      "grad_norm": 1.7466001202181465,
      "learning_rate": 3.2099999999999998e-06,
      "loss": 0.9506,
      "step": 321
    },
    {
      "epoch": 0.12181972949966897,
      "grad_norm": 1.9022275763429817,
      "learning_rate": 3.22e-06,
      "loss": 0.9452,
      "step": 322
    },
    {
      "epoch": 0.12219805164097229,
      "grad_norm": 1.62671365850624,
      "learning_rate": 3.23e-06,
      "loss": 0.9063,
      "step": 323
    },
    {
      "epoch": 0.12257637378227561,
      "grad_norm": 1.537323535673334,
      "learning_rate": 3.24e-06,
      "loss": 0.892,
      "step": 324
    },
    {
      "epoch": 0.12295469592357892,
      "grad_norm": 1.6088280546082747,
      "learning_rate": 3.25e-06,
      "loss": 0.9055,
      "step": 325
    },
    {
      "epoch": 0.12333301806488224,
      "grad_norm": 1.754864511511676,
      "learning_rate": 3.2599999999999997e-06,
      "loss": 0.9982,
      "step": 326
    },
    {
      "epoch": 0.12371134020618557,
      "grad_norm": 1.7110520395582398,
      "learning_rate": 3.27e-06,
      "loss": 0.8869,
      "step": 327
    },
    {
      "epoch": 0.12408966234748889,
      "grad_norm": 2.2210658284362976,
      "learning_rate": 3.2799999999999995e-06,
      "loss": 0.9,
      "step": 328
    },
    {
      "epoch": 0.1244679844887922,
      "grad_norm": 2.0718951481844337,
      "learning_rate": 3.29e-06,
      "loss": 0.9474,
      "step": 329
    },
    {
      "epoch": 0.12484630663009552,
      "grad_norm": 1.6483777638825354,
      "learning_rate": 3.2999999999999997e-06,
      "loss": 0.9193,
      "step": 330
    },
    {
      "epoch": 0.12522462877139884,
      "grad_norm": 1.8408500351694481,
      "learning_rate": 3.31e-06,
      "loss": 0.9331,
      "step": 331
    },
    {
      "epoch": 0.12560295091270215,
      "grad_norm": 1.5886399601274244,
      "learning_rate": 3.3199999999999996e-06,
      "loss": 0.9181,
      "step": 332
    },
    {
      "epoch": 0.1259812730540055,
      "grad_norm": 1.5415700759277726,
      "learning_rate": 3.33e-06,
      "loss": 0.9078,
      "step": 333
    },
    {
      "epoch": 0.12635959519530882,
      "grad_norm": 1.5699378541238653,
      "learning_rate": 3.3399999999999998e-06,
      "loss": 0.9415,
      "step": 334
    },
    {
      "epoch": 0.12673791733661213,
      "grad_norm": 1.4355378270145513,
      "learning_rate": 3.35e-06,
      "loss": 0.9328,
      "step": 335
    },
    {
      "epoch": 0.12711623947791545,
      "grad_norm": 1.4472036059899498,
      "learning_rate": 3.3599999999999996e-06,
      "loss": 0.9235,
      "step": 336
    },
    {
      "epoch": 0.12749456161921877,
      "grad_norm": 1.493466705425371,
      "learning_rate": 3.37e-06,
      "loss": 0.917,
      "step": 337
    },
    {
      "epoch": 0.12787288376052208,
      "grad_norm": 1.725222957788955,
      "learning_rate": 3.38e-06,
      "loss": 0.9229,
      "step": 338
    },
    {
      "epoch": 0.1282512059018254,
      "grad_norm": 1.829546156665469,
      "learning_rate": 3.39e-06,
      "loss": 0.9199,
      "step": 339
    },
    {
      "epoch": 0.12862952804312872,
      "grad_norm": 1.562404556848645,
      "learning_rate": 3.3999999999999996e-06,
      "loss": 0.9258,
      "step": 340
    },
    {
      "epoch": 0.12900785018443203,
      "grad_norm": 1.5503184849860385,
      "learning_rate": 3.41e-06,
      "loss": 0.9056,
      "step": 341
    },
    {
      "epoch": 0.12938617232573538,
      "grad_norm": 2.093643266825353,
      "learning_rate": 3.42e-06,
      "loss": 0.9151,
      "step": 342
    },
    {
      "epoch": 0.1297644944670387,
      "grad_norm": 1.5470351610527242,
      "learning_rate": 3.43e-06,
      "loss": 0.9295,
      "step": 343
    },
    {
      "epoch": 0.130142816608342,
      "grad_norm": 1.6415927498606424,
      "learning_rate": 3.4399999999999997e-06,
      "loss": 0.9227,
      "step": 344
    },
    {
      "epoch": 0.13052113874964533,
      "grad_norm": 1.501364967749395,
      "learning_rate": 3.45e-06,
      "loss": 0.9196,
      "step": 345
    },
    {
      "epoch": 0.13089946089094864,
      "grad_norm": 1.4667926955996313,
      "learning_rate": 3.46e-06,
      "loss": 0.9875,
      "step": 346
    },
    {
      "epoch": 0.13127778303225196,
      "grad_norm": 1.4015397895960147,
      "learning_rate": 3.4700000000000002e-06,
      "loss": 0.9174,
      "step": 347
    },
    {
      "epoch": 0.13165610517355528,
      "grad_norm": 1.6317901839112616,
      "learning_rate": 3.4799999999999997e-06,
      "loss": 0.9022,
      "step": 348
    },
    {
      "epoch": 0.1320344273148586,
      "grad_norm": 1.5495030641920218,
      "learning_rate": 3.49e-06,
      "loss": 0.9056,
      "step": 349
    },
    {
      "epoch": 0.1324127494561619,
      "grad_norm": 1.4169162437828007,
      "learning_rate": 3.5e-06,
      "loss": 0.9125,
      "step": 350
    },
    {
      "epoch": 0.13279107159746525,
      "grad_norm": 1.5269510878366184,
      "learning_rate": 3.5099999999999994e-06,
      "loss": 0.9325,
      "step": 351
    },
    {
      "epoch": 0.13316939373876857,
      "grad_norm": 1.4845731562408333,
      "learning_rate": 3.5199999999999998e-06,
      "loss": 0.9119,
      "step": 352
    },
    {
      "epoch": 0.1335477158800719,
      "grad_norm": 1.2998342684154016,
      "learning_rate": 3.5299999999999997e-06,
      "loss": 0.8989,
      "step": 353
    },
    {
      "epoch": 0.1339260380213752,
      "grad_norm": 1.4867481861923495,
      "learning_rate": 3.54e-06,
      "loss": 0.9201,
      "step": 354
    },
    {
      "epoch": 0.13430436016267852,
      "grad_norm": 1.4212824059163913,
      "learning_rate": 3.5499999999999995e-06,
      "loss": 0.9288,
      "step": 355
    },
    {
      "epoch": 0.13468268230398184,
      "grad_norm": 1.3588961307618976,
      "learning_rate": 3.56e-06,
      "loss": 0.9117,
      "step": 356
    },
    {
      "epoch": 0.13506100444528515,
      "grad_norm": 1.4097313807539793,
      "learning_rate": 3.5699999999999997e-06,
      "loss": 0.9139,
      "step": 357
    },
    {
      "epoch": 0.13543932658658847,
      "grad_norm": 1.490782064831479,
      "learning_rate": 3.58e-06,
      "loss": 0.938,
      "step": 358
    },
    {
      "epoch": 0.1358176487278918,
      "grad_norm": 1.2930048652835795,
      "learning_rate": 3.5899999999999995e-06,
      "loss": 0.9023,
      "step": 359
    },
    {
      "epoch": 0.13619597086919513,
      "grad_norm": 1.824182436515982,
      "learning_rate": 3.6e-06,
      "loss": 0.9343,
      "step": 360
    },
    {
      "epoch": 0.13657429301049845,
      "grad_norm": 1.4837219324976698,
      "learning_rate": 3.6099999999999997e-06,
      "loss": 0.9418,
      "step": 361
    },
    {
      "epoch": 0.13695261515180177,
      "grad_norm": 1.3718729917310193,
      "learning_rate": 3.62e-06,
      "loss": 0.9231,
      "step": 362
    },
    {
      "epoch": 0.13733093729310508,
      "grad_norm": 1.3644818822127356,
      "learning_rate": 3.6299999999999995e-06,
      "loss": 0.9093,
      "step": 363
    },
    {
      "epoch": 0.1377092594344084,
      "grad_norm": 1.4274881326706697,
      "learning_rate": 3.64e-06,
      "loss": 0.9077,
      "step": 364
    },
    {
      "epoch": 0.13808758157571172,
      "grad_norm": 1.3169195252885812,
      "learning_rate": 3.6499999999999998e-06,
      "loss": 0.8772,
      "step": 365
    },
    {
      "epoch": 0.13846590371701503,
      "grad_norm": 1.3505673564506786,
      "learning_rate": 3.66e-06,
      "loss": 0.8729,
      "step": 366
    },
    {
      "epoch": 0.13884422585831835,
      "grad_norm": 1.3728815922981648,
      "learning_rate": 3.6699999999999996e-06,
      "loss": 0.91,
      "step": 367
    },
    {
      "epoch": 0.13922254799962167,
      "grad_norm": 1.4225979847364822,
      "learning_rate": 3.68e-06,
      "loss": 0.8862,
      "step": 368
    },
    {
      "epoch": 0.139600870140925,
      "grad_norm": 1.3363118705656714,
      "learning_rate": 3.69e-06,
      "loss": 0.9322,
      "step": 369
    },
    {
      "epoch": 0.13997919228222833,
      "grad_norm": 1.318614371056809,
      "learning_rate": 3.7e-06,
      "loss": 0.926,
      "step": 370
    },
    {
      "epoch": 0.14035751442353164,
      "grad_norm": 1.330484253084181,
      "learning_rate": 3.7099999999999996e-06,
      "loss": 0.9456,
      "step": 371
    },
    {
      "epoch": 0.14073583656483496,
      "grad_norm": 1.3318506320691512,
      "learning_rate": 3.72e-06,
      "loss": 0.9017,
      "step": 372
    },
    {
      "epoch": 0.14111415870613828,
      "grad_norm": 1.3759434761704756,
      "learning_rate": 3.73e-06,
      "loss": 0.8881,
      "step": 373
    },
    {
      "epoch": 0.1414924808474416,
      "grad_norm": 1.3957619030952084,
      "learning_rate": 3.74e-06,
      "loss": 0.9121,
      "step": 374
    },
    {
      "epoch": 0.1418708029887449,
      "grad_norm": 1.3427799016571502,
      "learning_rate": 3.7499999999999997e-06,
      "loss": 0.9106,
      "step": 375
    },
    {
      "epoch": 0.14224912513004823,
      "grad_norm": 44.30080368963616,
      "learning_rate": 3.7599999999999996e-06,
      "loss": 0.8911,
      "step": 376
    },
    {
      "epoch": 0.14262744727135154,
      "grad_norm": 2.2669972347416127,
      "learning_rate": 3.77e-06,
      "loss": 0.933,
      "step": 377
    },
    {
      "epoch": 0.1430057694126549,
      "grad_norm": 1.4829201626961606,
      "learning_rate": 3.78e-06,
      "loss": 0.901,
      "step": 378
    },
    {
      "epoch": 0.1433840915539582,
      "grad_norm": 4.064663928049432,
      "learning_rate": 3.7899999999999997e-06,
      "loss": 0.8942,
      "step": 379
    },
    {
      "epoch": 0.14376241369526152,
      "grad_norm": 1.8169275430345828,
      "learning_rate": 3.7999999999999996e-06,
      "loss": 0.88,
      "step": 380
    },
    {
      "epoch": 0.14414073583656484,
      "grad_norm": 1.903257571166488,
      "learning_rate": 3.81e-06,
      "loss": 0.9286,
      "step": 381
    },
    {
      "epoch": 0.14451905797786815,
      "grad_norm": 1.662557610937424,
      "learning_rate": 3.82e-06,
      "loss": 0.8947,
      "step": 382
    },
    {
      "epoch": 0.14489738011917147,
      "grad_norm": 1.3504615763712993,
      "learning_rate": 3.83e-06,
      "loss": 0.9081,
      "step": 383
    },
    {
      "epoch": 0.1452757022604748,
      "grad_norm": 2.083053759282353,
      "learning_rate": 3.84e-06,
      "loss": 0.9229,
      "step": 384
    },
    {
      "epoch": 0.1456540244017781,
      "grad_norm": 1.5724819369725127,
      "learning_rate": 3.8499999999999996e-06,
      "loss": 0.9019,
      "step": 385
    },
    {
      "epoch": 0.14603234654308142,
      "grad_norm": 1.2833291006046557,
      "learning_rate": 3.8599999999999995e-06,
      "loss": 0.8943,
      "step": 386
    },
    {
      "epoch": 0.14641066868438476,
      "grad_norm": 1.6810072820257926,
      "learning_rate": 3.87e-06,
      "loss": 0.9469,
      "step": 387
    },
    {
      "epoch": 0.14678899082568808,
      "grad_norm": 1.462137670239198,
      "learning_rate": 3.88e-06,
      "loss": 0.885,
      "step": 388
    },
    {
      "epoch": 0.1471673129669914,
      "grad_norm": 1.3544773507596952,
      "learning_rate": 3.89e-06,
      "loss": 0.9223,
      "step": 389
    },
    {
      "epoch": 0.14754563510829471,
      "grad_norm": 1.305788748108731,
      "learning_rate": 3.9e-06,
      "loss": 0.9085,
      "step": 390
    },
    {
      "epoch": 0.14792395724959803,
      "grad_norm": 1.4728433076805145,
      "learning_rate": 3.91e-06,
      "loss": 0.9111,
      "step": 391
    },
    {
      "epoch": 0.14830227939090135,
      "grad_norm": 1.3023289374881166,
      "learning_rate": 3.92e-06,
      "loss": 0.9082,
      "step": 392
    },
    {
      "epoch": 0.14868060153220466,
      "grad_norm": 1.528856941817902,
      "learning_rate": 3.93e-06,
      "loss": 0.8583,
      "step": 393
    },
    {
      "epoch": 0.14905892367350798,
      "grad_norm": 1.2279025499674738,
      "learning_rate": 3.9399999999999995e-06,
      "loss": 0.8943,
      "step": 394
    },
    {
      "epoch": 0.1494372458148113,
      "grad_norm": 1.5480907504889059,
      "learning_rate": 3.95e-06,
      "loss": 0.858,
      "step": 395
    },
    {
      "epoch": 0.14981556795611464,
      "grad_norm": 1.3146063824478018,
      "learning_rate": 3.96e-06,
      "loss": 0.8618,
      "step": 396
    },
    {
      "epoch": 0.15019389009741796,
      "grad_norm": 1.334057857690303,
      "learning_rate": 3.97e-06,
      "loss": 0.9243,
      "step": 397
    },
    {
      "epoch": 0.15057221223872128,
      "grad_norm": 1.3866128005645164,
      "learning_rate": 3.98e-06,
      "loss": 0.9274,
      "step": 398
    },
    {
      "epoch": 0.1509505343800246,
      "grad_norm": 1.2955294219171367,
      "learning_rate": 3.99e-06,
      "loss": 0.9173,
      "step": 399
    },
    {
      "epoch": 0.1509505343800246,
      "eval_loss": 0.9027320742607117,
      "eval_runtime": 27.0581,
      "eval_samples_per_second": 32.707,
      "eval_steps_per_second": 1.035,
      "step": 399
    },
    {
      "epoch": 0.1509505343800246,
      "eval_bench_accuracy_arc_challenge": 0.24285714285714285,
      "eval_bench_accuracy_hellaswag": 0.24,
      "eval_bench_accuracy_mmlu": 0.3739130434782609,
      "eval_bench_average_accuracy": 0.2855900621118012,
      "eval_bench_loss": 4.885084721080044,
      "eval_bench_total_accuracy": 0.27472527472527475,
      "step": 399
    },
    {
      "epoch": 0.1513288565213279,
      "grad_norm": 1.4867956471987611,
      "learning_rate": 4e-06,
      "loss": 0.8442,
      "step": 400
    },
    {
      "epoch": 0.15170717866263123,
      "grad_norm": 1.4418482940385888,
      "learning_rate": 4.01e-06,
      "loss": 0.8851,
      "step": 401
    },
    {
      "epoch": 0.15208550080393454,
      "grad_norm": 1.2367816437008439,
      "learning_rate": 4.02e-06,
      "loss": 0.9016,
      "step": 402
    },
    {
      "epoch": 0.15246382294523786,
      "grad_norm": 1.3381669970164036,
      "learning_rate": 4.03e-06,
      "loss": 0.8967,
      "step": 403
    },
    {
      "epoch": 0.1528421450865412,
      "grad_norm": 1.178040710244701,
      "learning_rate": 4.0399999999999994e-06,
      "loss": 0.9052,
      "step": 404
    },
    {
      "epoch": 0.15322046722784452,
      "grad_norm": 1.354680203607332,
      "learning_rate": 4.049999999999999e-06,
      "loss": 0.916,
      "step": 405
    },
    {
      "epoch": 0.15359878936914784,
      "grad_norm": 1.2478760852613116,
      "learning_rate": 4.059999999999999e-06,
      "loss": 0.8918,
      "step": 406
    },
    {
      "epoch": 0.15397711151045115,
      "grad_norm": 1.3580886429686791,
      "learning_rate": 4.07e-06,
      "loss": 0.8769,
      "step": 407
    },
    {
      "epoch": 0.15435543365175447,
      "grad_norm": 1.4849252692119392,
      "learning_rate": 4.08e-06,
      "loss": 0.8985,
      "step": 408
    },
    {
      "epoch": 0.1547337557930578,
      "grad_norm": 1.234446053198778,
      "learning_rate": 4.09e-06,
      "loss": 0.8681,
      "step": 409
    },
    {
      "epoch": 0.1551120779343611,
      "grad_norm": 1.4907001456714162,
      "learning_rate": 4.1e-06,
      "loss": 0.9035,
      "step": 410
    },
    {
      "epoch": 0.15549040007566442,
      "grad_norm": 1.1935520171507346,
      "learning_rate": 4.1100000000000005e-06,
      "loss": 0.8939,
      "step": 411
    },
    {
      "epoch": 0.15586872221696774,
      "grad_norm": 1.3431797561411594,
      "learning_rate": 4.1199999999999995e-06,
      "loss": 0.8892,
      "step": 412
    },
    {
      "epoch": 0.15624704435827108,
      "grad_norm": 1.1858701499867044,
      "learning_rate": 4.129999999999999e-06,
      "loss": 0.8952,
      "step": 413
    },
    {
      "epoch": 0.1566253664995744,
      "grad_norm": 1.3160462921208504,
      "learning_rate": 4.139999999999999e-06,
      "loss": 0.9104,
      "step": 414
    },
    {
      "epoch": 0.15700368864087771,
      "grad_norm": 1.205303163621962,
      "learning_rate": 4.15e-06,
      "loss": 0.8989,
      "step": 415
    },
    {
      "epoch": 0.15738201078218103,
      "grad_norm": 1.2116662309617274,
      "learning_rate": 4.16e-06,
      "loss": 0.9178,
      "step": 416
    },
    {
      "epoch": 0.15776033292348435,
      "grad_norm": 1.1758637546414648,
      "learning_rate": 4.17e-06,
      "loss": 0.8792,
      "step": 417
    },
    {
      "epoch": 0.15813865506478766,
      "grad_norm": 1.2552462548629688,
      "learning_rate": 4.18e-06,
      "loss": 0.8981,
      "step": 418
    },
    {
      "epoch": 0.15851697720609098,
      "grad_norm": 1.206264514397755,
      "learning_rate": 4.1900000000000005e-06,
      "loss": 0.9058,
      "step": 419
    },
    {
      "epoch": 0.1588952993473943,
      "grad_norm": 1.2231014501429258,
      "learning_rate": 4.2e-06,
      "loss": 0.899,
      "step": 420
    },
    {
      "epoch": 0.15927362148869761,
      "grad_norm": 1.2120070273790158,
      "learning_rate": 4.2099999999999995e-06,
      "loss": 0.8449,
      "step": 421
    },
    {
      "epoch": 0.15965194363000096,
      "grad_norm": 1.225434870357441,
      "learning_rate": 4.219999999999999e-06,
      "loss": 0.8925,
      "step": 422
    },
    {
      "epoch": 0.16003026577130428,
      "grad_norm": 1.2700536143173544,
      "learning_rate": 4.23e-06,
      "loss": 0.8948,
      "step": 423
    },
    {
      "epoch": 0.1604085879126076,
      "grad_norm": 1.327617668860312,
      "learning_rate": 4.24e-06,
      "loss": 0.8808,
      "step": 424
    },
    {
      "epoch": 0.1607869100539109,
      "grad_norm": 1.2286005573930583,
      "learning_rate": 4.25e-06,
      "loss": 0.8885,
      "step": 425
    },
    {
      "epoch": 0.16116523219521423,
      "grad_norm": 1.265158345195646,
      "learning_rate": 4.26e-06,
      "loss": 0.8973,
      "step": 426
    },
    {
      "epoch": 0.16154355433651754,
      "grad_norm": 1.2113247771231779,
      "learning_rate": 4.27e-06,
      "loss": 0.88,
      "step": 427
    },
    {
      "epoch": 0.16192187647782086,
      "grad_norm": 1.1981923822069018,
      "learning_rate": 4.28e-06,
      "loss": 0.8812,
      "step": 428
    },
    {
      "epoch": 0.16230019861912418,
      "grad_norm": 1.269210905108754,
      "learning_rate": 4.29e-06,
      "loss": 0.951,
      "step": 429
    },
    {
      "epoch": 0.1626785207604275,
      "grad_norm": 1.270040077896289,
      "learning_rate": 4.2999999999999995e-06,
      "loss": 0.8502,
      "step": 430
    },
    {
      "epoch": 0.16305684290173084,
      "grad_norm": 1.2459835235482208,
      "learning_rate": 4.309999999999999e-06,
      "loss": 0.9249,
      "step": 431
    },
    {
      "epoch": 0.16343516504303415,
      "grad_norm": 1.2065849160511677,
      "learning_rate": 4.32e-06,
      "loss": 0.8569,
      "step": 432
    },
    {
      "epoch": 0.16381348718433747,
      "grad_norm": 1.3240957525319628,
      "learning_rate": 4.33e-06,
      "loss": 0.8378,
      "step": 433
    },
    {
      "epoch": 0.1641918093256408,
      "grad_norm": 1.308494624204772,
      "learning_rate": 4.34e-06,
      "loss": 0.8853,
      "step": 434
    },
    {
      "epoch": 0.1645701314669441,
      "grad_norm": 1.2876226830148083,
      "learning_rate": 4.35e-06,
      "loss": 0.8999,
      "step": 435
    },
    {
      "epoch": 0.16494845360824742,
      "grad_norm": 1.3895344761060464,
      "learning_rate": 4.36e-06,
      "loss": 0.8995,
      "step": 436
    },
    {
      "epoch": 0.16532677574955074,
      "grad_norm": 1.2397074052657744,
      "learning_rate": 4.37e-06,
      "loss": 0.8787,
      "step": 437
    },
    {
      "epoch": 0.16570509789085405,
      "grad_norm": 1.2286411029399464,
      "learning_rate": 4.3799999999999996e-06,
      "loss": 0.8968,
      "step": 438
    },
    {
      "epoch": 0.16608342003215737,
      "grad_norm": 1.231038186520652,
      "learning_rate": 4.3899999999999995e-06,
      "loss": 0.8781,
      "step": 439
    },
    {
      "epoch": 0.16646174217346071,
      "grad_norm": 1.2138487844408843,
      "learning_rate": 4.4e-06,
      "loss": 0.8698,
      "step": 440
    },
    {
      "epoch": 0.16684006431476403,
      "grad_norm": 1.3027744892443913,
      "learning_rate": 4.41e-06,
      "loss": 0.9253,
      "step": 441
    },
    {
      "epoch": 0.16721838645606735,
      "grad_norm": 1.2467659827353952,
      "learning_rate": 4.42e-06,
      "loss": 0.9121,
      "step": 442
    },
    {
      "epoch": 0.16759670859737066,
      "grad_norm": 1.1589200132022377,
      "learning_rate": 4.43e-06,
      "loss": 0.8803,
      "step": 443
    },
    {
      "epoch": 0.16797503073867398,
      "grad_norm": 1.2200621136986902,
      "learning_rate": 4.44e-06,
      "loss": 0.9079,
      "step": 444
    },
    {
      "epoch": 0.1683533528799773,
      "grad_norm": 1.1747935123553643,
      "learning_rate": 4.45e-06,
      "loss": 0.8766,
      "step": 445
    },
    {
      "epoch": 0.1687316750212806,
      "grad_norm": 1.1865214460906777,
      "learning_rate": 4.46e-06,
      "loss": 0.9068,
      "step": 446
    },
    {
      "epoch": 0.16910999716258393,
      "grad_norm": 1.2579950961305297,
      "learning_rate": 4.4699999999999996e-06,
      "loss": 0.8815,
      "step": 447
    },
    {
      "epoch": 0.16948831930388725,
      "grad_norm": 1.226665097174107,
      "learning_rate": 4.48e-06,
      "loss": 0.9327,
      "step": 448
    },
    {
      "epoch": 0.1698666414451906,
      "grad_norm": 1.1931395850546989,
      "learning_rate": 4.49e-06,
      "loss": 0.8796,
      "step": 449
    },
    {
      "epoch": 0.1702449635864939,
      "grad_norm": 1.202501530652917,
      "learning_rate": 4.5e-06,
      "loss": 0.8931,
      "step": 450
    },
    {
      "epoch": 0.17062328572779722,
      "grad_norm": 1.1807025967685065,
      "learning_rate": 4.509999999999999e-06,
      "loss": 0.8887,
      "step": 451
    },
    {
      "epoch": 0.17100160786910054,
      "grad_norm": 1.219222521929812,
      "learning_rate": 4.519999999999999e-06,
      "loss": 0.8999,
      "step": 452
    },
    {
      "epoch": 0.17137993001040386,
      "grad_norm": 1.234613051649134,
      "learning_rate": 4.53e-06,
      "loss": 0.8439,
      "step": 453
    },
    {
      "epoch": 0.17175825215170717,
      "grad_norm": 1.2268814413232634,
      "learning_rate": 4.54e-06,
      "loss": 0.8679,
      "step": 454
    },
    {
      "epoch": 0.1721365742930105,
      "grad_norm": 1.2687792576706662,
      "learning_rate": 4.55e-06,
      "loss": 0.9137,
      "step": 455
    },
    {
      "epoch": 0.1725148964343138,
      "grad_norm": 1.259597511238193,
      "learning_rate": 4.5599999999999995e-06,
      "loss": 0.8929,
      "step": 456
    },
    {
      "epoch": 0.17289321857561712,
      "grad_norm": 1.1601209722807053,
      "learning_rate": 4.57e-06,
      "loss": 0.8989,
      "step": 457
    },
    {
      "epoch": 0.17327154071692047,
      "grad_norm": 1.1337571129482695,
      "learning_rate": 4.58e-06,
      "loss": 0.8867,
      "step": 458
    },
    {
      "epoch": 0.17364986285822379,
      "grad_norm": 1.2315099804928107,
      "learning_rate": 4.589999999999999e-06,
      "loss": 0.8766,
      "step": 459
    },
    {
      "epoch": 0.1740281849995271,
      "grad_norm": 1.1590598116825013,
      "learning_rate": 4.599999999999999e-06,
      "loss": 0.8996,
      "step": 460
    },
    {
      "epoch": 0.17440650714083042,
      "grad_norm": 1.2223724961641853,
      "learning_rate": 4.61e-06,
      "loss": 0.8885,
      "step": 461
    },
    {
      "epoch": 0.17478482928213374,
      "grad_norm": 1.2563659855924223,
      "learning_rate": 4.62e-06,
      "loss": 0.9316,
      "step": 462
    },
    {
      "epoch": 0.17516315142343705,
      "grad_norm": 1.2219308373205684,
      "learning_rate": 4.63e-06,
      "loss": 0.9402,
      "step": 463
    },
    {
      "epoch": 0.17554147356474037,
      "grad_norm": 1.2529933281060042,
      "learning_rate": 4.64e-06,
      "loss": 0.8425,
      "step": 464
    },
    {
      "epoch": 0.17591979570604369,
      "grad_norm": 1.1519152308086784,
      "learning_rate": 4.65e-06,
      "loss": 0.8335,
      "step": 465
    },
    {
      "epoch": 0.176298117847347,
      "grad_norm": 1.1993447663063845,
      "learning_rate": 4.66e-06,
      "loss": 0.8423,
      "step": 466
    },
    {
      "epoch": 0.17667643998865035,
      "grad_norm": 1.2393551988442821,
      "learning_rate": 4.669999999999999e-06,
      "loss": 0.8766,
      "step": 467
    },
    {
      "epoch": 0.17705476212995366,
      "grad_norm": 1.1568166146377072,
      "learning_rate": 4.679999999999999e-06,
      "loss": 0.913,
      "step": 468
    },
    {
      "epoch": 0.17743308427125698,
      "grad_norm": 1.2535994832897241,
      "learning_rate": 4.69e-06,
      "loss": 0.8611,
      "step": 469
    },
    {
      "epoch": 0.1778114064125603,
      "grad_norm": 1.2581510292576754,
      "learning_rate": 4.7e-06,
      "loss": 0.852,
      "step": 470
    },
    {
      "epoch": 0.1781897285538636,
      "grad_norm": 1.185843568335289,
      "learning_rate": 4.71e-06,
      "loss": 0.8712,
      "step": 471
    },
    {
      "epoch": 0.17856805069516693,
      "grad_norm": 1.1762961141384334,
      "learning_rate": 4.72e-06,
      "loss": 0.8848,
      "step": 472
    },
    {
      "epoch": 0.17894637283647025,
      "grad_norm": 1.2378038953878985,
      "learning_rate": 4.7300000000000005e-06,
      "loss": 0.89,
      "step": 473
    },
    {
      "epoch": 0.17932469497777356,
      "grad_norm": 1.2303598909876003,
      "learning_rate": 4.74e-06,
      "loss": 0.9019,
      "step": 474
    },
    {
      "epoch": 0.1797030171190769,
      "grad_norm": 1.3055168080029775,
      "learning_rate": 4.749999999999999e-06,
      "loss": 0.8886,
      "step": 475
    },
    {
      "epoch": 0.18008133926038022,
      "grad_norm": 1.263816208541402,
      "learning_rate": 4.759999999999999e-06,
      "loss": 0.8934,
      "step": 476
    },
    {
      "epoch": 0.18045966140168354,
      "grad_norm": 1.2304160263194301,
      "learning_rate": 4.769999999999999e-06,
      "loss": 0.8334,
      "step": 477
    },
    {
      "epoch": 0.18083798354298686,
      "grad_norm": 1.16427739617554,
      "learning_rate": 4.78e-06,
      "loss": 0.8933,
      "step": 478
    },
    {
      "epoch": 0.18121630568429017,
      "grad_norm": 1.2928340654165948,
      "learning_rate": 4.79e-06,
      "loss": 0.9091,
      "step": 479
    },
    {
      "epoch": 0.1815946278255935,
      "grad_norm": 1.2237270548636812,
      "learning_rate": 4.8e-06,
      "loss": 0.8894,
      "step": 480
    },
    {
      "epoch": 0.1819729499668968,
      "grad_norm": 1.2973745239107866,
      "learning_rate": 4.81e-06,
      "loss": 0.8827,
      "step": 481
    },
    {
      "epoch": 0.18235127210820012,
      "grad_norm": 1.2192171355443393,
      "learning_rate": 4.8200000000000004e-06,
      "loss": 0.842,
      "step": 482
    },
    {
      "epoch": 0.18272959424950344,
      "grad_norm": 1.1825464816429376,
      "learning_rate": 4.8299999999999995e-06,
      "loss": 0.8974,
      "step": 483
    },
    {
      "epoch": 0.18310791639080679,
      "grad_norm": 1.2357877717915002,
      "learning_rate": 4.839999999999999e-06,
      "loss": 0.8713,
      "step": 484
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 1.2724832467234655,
      "learning_rate": 4.849999999999999e-06,
      "loss": 0.8916,
      "step": 485
    },
    {
      "epoch": 0.18386456067341342,
      "grad_norm": 1.2402819428437333,
      "learning_rate": 4.86e-06,
      "loss": 0.9006,
      "step": 486
    },
    {
      "epoch": 0.18424288281471674,
      "grad_norm": 1.253080289206958,
      "learning_rate": 4.87e-06,
      "loss": 0.8552,
      "step": 487
    },
    {
      "epoch": 0.18462120495602005,
      "grad_norm": 1.20114987062819,
      "learning_rate": 4.88e-06,
      "loss": 0.8646,
      "step": 488
    },
    {
      "epoch": 0.18499952709732337,
      "grad_norm": 1.2698388666443412,
      "learning_rate": 4.89e-06,
      "loss": 0.9058,
      "step": 489
    },
    {
      "epoch": 0.18537784923862669,
      "grad_norm": 1.255138008138629,
      "learning_rate": 4.9000000000000005e-06,
      "loss": 0.9045,
      "step": 490
    },
    {
      "epoch": 0.18575617137993,
      "grad_norm": 1.173366935458501,
      "learning_rate": 4.91e-06,
      "loss": 0.8653,
      "step": 491
    },
    {
      "epoch": 0.18613449352123332,
      "grad_norm": 1.2544859383454867,
      "learning_rate": 4.9199999999999995e-06,
      "loss": 0.8577,
      "step": 492
    },
    {
      "epoch": 0.18651281566253666,
      "grad_norm": 1.1732808685881084,
      "learning_rate": 4.929999999999999e-06,
      "loss": 0.8551,
      "step": 493
    },
    {
      "epoch": 0.18689113780383998,
      "grad_norm": 1.2265764031917046,
      "learning_rate": 4.94e-06,
      "loss": 0.8726,
      "step": 494
    },
    {
      "epoch": 0.1872694599451433,
      "grad_norm": 1.2234524388802157,
      "learning_rate": 4.95e-06,
      "loss": 0.8833,
      "step": 495
    },
    {
      "epoch": 0.1876477820864466,
      "grad_norm": 1.2488343163013593,
      "learning_rate": 4.96e-06,
      "loss": 0.8704,
      "step": 496
    },
    {
      "epoch": 0.18802610422774993,
      "grad_norm": 1.1667370629188312,
      "learning_rate": 4.97e-06,
      "loss": 0.8637,
      "step": 497
    },
    {
      "epoch": 0.18840442636905325,
      "grad_norm": 1.1300202443780525,
      "learning_rate": 4.980000000000001e-06,
      "loss": 0.8222,
      "step": 498
    },
    {
      "epoch": 0.18878274851035656,
      "grad_norm": 1.2105094043051028,
      "learning_rate": 4.99e-06,
      "loss": 0.8172,
      "step": 499
    },
    {
      "epoch": 0.18916107065165988,
      "grad_norm": 1.147109513607525,
      "learning_rate": 4.9999999999999996e-06,
      "loss": 0.8718,
      "step": 500
    },
    {
      "epoch": 0.1895393927929632,
      "grad_norm": 1.186254501579871,
      "learning_rate": 5.0099999999999995e-06,
      "loss": 0.8672,
      "step": 501
    },
    {
      "epoch": 0.18991771493426654,
      "grad_norm": 1.1921470006777564,
      "learning_rate": 5.019999999999999e-06,
      "loss": 0.8984,
      "step": 502
    },
    {
      "epoch": 0.19029603707556986,
      "grad_norm": 1.204441588496536,
      "learning_rate": 5.03e-06,
      "loss": 0.8933,
      "step": 503
    },
    {
      "epoch": 0.19067435921687317,
      "grad_norm": 1.176488402672726,
      "learning_rate": 5.04e-06,
      "loss": 0.8179,
      "step": 504
    },
    {
      "epoch": 0.1910526813581765,
      "grad_norm": 1.1591890939118275,
      "learning_rate": 5.05e-06,
      "loss": 0.8994,
      "step": 505
    },
    {
      "epoch": 0.1914310034994798,
      "grad_norm": 1.1844780849489716,
      "learning_rate": 5.059999999999999e-06,
      "loss": 0.9002,
      "step": 506
    },
    {
      "epoch": 0.19180932564078312,
      "grad_norm": 1.1340897482563235,
      "learning_rate": 5.07e-06,
      "loss": 0.8629,
      "step": 507
    },
    {
      "epoch": 0.19218764778208644,
      "grad_norm": 1.242695087632576,
      "learning_rate": 5.08e-06,
      "loss": 0.893,
      "step": 508
    },
    {
      "epoch": 0.19256596992338976,
      "grad_norm": 1.21618537349293,
      "learning_rate": 5.0899999999999995e-06,
      "loss": 0.8874,
      "step": 509
    },
    {
      "epoch": 0.19294429206469307,
      "grad_norm": 1.2081469798752933,
      "learning_rate": 5.0999999999999995e-06,
      "loss": 0.8672,
      "step": 510
    },
    {
      "epoch": 0.19332261420599642,
      "grad_norm": 1.1486757711757551,
      "learning_rate": 5.11e-06,
      "loss": 0.8445,
      "step": 511
    },
    {
      "epoch": 0.19370093634729973,
      "grad_norm": 1.160176382154706,
      "learning_rate": 5.12e-06,
      "loss": 0.8689,
      "step": 512
    },
    {
      "epoch": 0.19407925848860305,
      "grad_norm": 1.1842115955863446,
      "learning_rate": 5.13e-06,
      "loss": 0.887,
      "step": 513
    },
    {
      "epoch": 0.19445758062990637,
      "grad_norm": 1.1622953235550992,
      "learning_rate": 5.139999999999999e-06,
      "loss": 0.8891,
      "step": 514
    },
    {
      "epoch": 0.19483590277120968,
      "grad_norm": 1.2278834007146076,
      "learning_rate": 5.15e-06,
      "loss": 0.9542,
      "step": 515
    },
    {
      "epoch": 0.195214224912513,
      "grad_norm": 1.1688897803585725,
      "learning_rate": 5.16e-06,
      "loss": 0.842,
      "step": 516
    },
    {
      "epoch": 0.19559254705381632,
      "grad_norm": 1.169443235508946,
      "learning_rate": 5.17e-06,
      "loss": 0.926,
      "step": 517
    },
    {
      "epoch": 0.19597086919511963,
      "grad_norm": 1.190101722103473,
      "learning_rate": 5.1799999999999995e-06,
      "loss": 0.9012,
      "step": 518
    },
    {
      "epoch": 0.19634919133642295,
      "grad_norm": 1.1139938105404836,
      "learning_rate": 5.19e-06,
      "loss": 0.8355,
      "step": 519
    },
    {
      "epoch": 0.1967275134777263,
      "grad_norm": 1.1644272208548614,
      "learning_rate": 5.2e-06,
      "loss": 0.8508,
      "step": 520
    },
    {
      "epoch": 0.1971058356190296,
      "grad_norm": 1.188005585447595,
      "learning_rate": 5.21e-06,
      "loss": 0.8884,
      "step": 521
    },
    {
      "epoch": 0.19748415776033293,
      "grad_norm": 1.162381129570287,
      "learning_rate": 5.219999999999999e-06,
      "loss": 0.8494,
      "step": 522
    },
    {
      "epoch": 0.19786247990163625,
      "grad_norm": 1.1379792376540319,
      "learning_rate": 5.23e-06,
      "loss": 0.8427,
      "step": 523
    },
    {
      "epoch": 0.19824080204293956,
      "grad_norm": 1.163441860737916,
      "learning_rate": 5.24e-06,
      "loss": 0.8831,
      "step": 524
    },
    {
      "epoch": 0.19861912418424288,
      "grad_norm": 1.1604063632172568,
      "learning_rate": 5.25e-06,
      "loss": 0.8898,
      "step": 525
    },
    {
      "epoch": 0.1989974463255462,
      "grad_norm": 1.1325670759545932,
      "learning_rate": 5.26e-06,
      "loss": 0.8735,
      "step": 526
    },
    {
      "epoch": 0.1993757684668495,
      "grad_norm": 1.1790821072251718,
      "learning_rate": 5.2699999999999995e-06,
      "loss": 0.8343,
      "step": 527
    },
    {
      "epoch": 0.19975409060815283,
      "grad_norm": 1.1453742135606537,
      "learning_rate": 5.28e-06,
      "loss": 0.8566,
      "step": 528
    },
    {
      "epoch": 0.20013241274945617,
      "grad_norm": 1.13296207138768,
      "learning_rate": 5.29e-06,
      "loss": 0.8659,
      "step": 529
    },
    {
      "epoch": 0.2005107348907595,
      "grad_norm": 1.1666609028219261,
      "learning_rate": 5.299999999999999e-06,
      "loss": 0.8853,
      "step": 530
    },
    {
      "epoch": 0.2008890570320628,
      "grad_norm": 1.1656374685369397,
      "learning_rate": 5.309999999999999e-06,
      "loss": 0.9086,
      "step": 531
    },
    {
      "epoch": 0.20126737917336612,
      "grad_norm": 1.1343885551812507,
      "learning_rate": 5.32e-06,
      "loss": 0.8379,
      "step": 532
    },
    {
      "epoch": 0.20126737917336612,
      "eval_loss": 0.8767463564872742,
      "eval_runtime": 26.8872,
      "eval_samples_per_second": 32.915,
      "eval_steps_per_second": 1.041,
      "step": 532
    },
    {
      "epoch": 0.20126737917336612,
      "eval_bench_accuracy_arc_challenge": 0.24285714285714285,
      "eval_bench_accuracy_hellaswag": 0.275,
      "eval_bench_accuracy_mmlu": 0.3391304347826087,
      "eval_bench_average_accuracy": 0.2856625258799172,
      "eval_bench_loss": 5.605643824527138,
      "eval_bench_total_accuracy": 0.2813186813186813,
      "step": 532
    },
    {
      "epoch": 0.20164570131466944,
      "grad_norm": 1.1898287763707267,
      "learning_rate": 5.33e-06,
      "loss": 0.8633,
      "step": 533
    },
    {
      "epoch": 0.20202402345597276,
      "grad_norm": 1.2061752853772802,
      "learning_rate": 5.34e-06,
      "loss": 0.8537,
      "step": 534
    },
    {
      "epoch": 0.20240234559727607,
      "grad_norm": 1.1524730070815266,
      "learning_rate": 5.35e-06,
      "loss": 0.8658,
      "step": 535
    },
    {
      "epoch": 0.2027806677385794,
      "grad_norm": 1.2112053959243978,
      "learning_rate": 5.36e-06,
      "loss": 0.8658,
      "step": 536
    },
    {
      "epoch": 0.2031589898798827,
      "grad_norm": 1.1062007713391508,
      "learning_rate": 5.37e-06,
      "loss": 0.8695,
      "step": 537
    },
    {
      "epoch": 0.20353731202118605,
      "grad_norm": 1.1454209056836882,
      "learning_rate": 5.379999999999999e-06,
      "loss": 0.8411,
      "step": 538
    },
    {
      "epoch": 0.20391563416248937,
      "grad_norm": 1.1969213700372077,
      "learning_rate": 5.389999999999999e-06,
      "loss": 0.8262,
      "step": 539
    },
    {
      "epoch": 0.20429395630379268,
      "grad_norm": 1.1817755878296146,
      "learning_rate": 5.4e-06,
      "loss": 0.8928,
      "step": 540
    },
    {
      "epoch": 0.204672278445096,
      "grad_norm": 1.2881214697120862,
      "learning_rate": 5.41e-06,
      "loss": 0.8755,
      "step": 541
    },
    {
      "epoch": 0.20505060058639932,
      "grad_norm": 1.1803409039809667,
      "learning_rate": 5.42e-06,
      "loss": 0.8728,
      "step": 542
    },
    {
      "epoch": 0.20542892272770263,
      "grad_norm": 1.2147547833072705,
      "learning_rate": 5.43e-06,
      "loss": 0.8673,
      "step": 543
    },
    {
      "epoch": 0.20580724486900595,
      "grad_norm": 1.111022507543289,
      "learning_rate": 5.4400000000000004e-06,
      "loss": 0.8572,
      "step": 544
    },
    {
      "epoch": 0.20618556701030927,
      "grad_norm": 1.229625708529713,
      "learning_rate": 5.45e-06,
      "loss": 0.9064,
      "step": 545
    },
    {
      "epoch": 0.2065638891516126,
      "grad_norm": 1.1293738392645483,
      "learning_rate": 5.459999999999999e-06,
      "loss": 0.8504,
      "step": 546
    },
    {
      "epoch": 0.20694221129291593,
      "grad_norm": 1.1526707564326522,
      "learning_rate": 5.469999999999999e-06,
      "loss": 0.8722,
      "step": 547
    },
    {
      "epoch": 0.20732053343421925,
      "grad_norm": 1.1056906302195102,
      "learning_rate": 5.48e-06,
      "loss": 0.8253,
      "step": 548
    },
    {
      "epoch": 0.20769885557552256,
      "grad_norm": 1.1541954114677542,
      "learning_rate": 5.49e-06,
      "loss": 0.8475,
      "step": 549
    },
    {
      "epoch": 0.20807717771682588,
      "grad_norm": 1.151670600398325,
      "learning_rate": 5.5e-06,
      "loss": 0.8372,
      "step": 550
    },
    {
      "epoch": 0.2084554998581292,
      "grad_norm": 1.157820909806914,
      "learning_rate": 5.51e-06,
      "loss": 0.8595,
      "step": 551
    },
    {
      "epoch": 0.2088338219994325,
      "grad_norm": 1.1605316476134264,
      "learning_rate": 5.52e-06,
      "loss": 0.8595,
      "step": 552
    },
    {
      "epoch": 0.20921214414073583,
      "grad_norm": 1.1898854269979218,
      "learning_rate": 5.53e-06,
      "loss": 0.8499,
      "step": 553
    },
    {
      "epoch": 0.20959046628203915,
      "grad_norm": 1.1432985309555297,
      "learning_rate": 5.5399999999999995e-06,
      "loss": 0.9105,
      "step": 554
    },
    {
      "epoch": 0.2099687884233425,
      "grad_norm": 1.1991072095190312,
      "learning_rate": 5.549999999999999e-06,
      "loss": 0.9184,
      "step": 555
    },
    {
      "epoch": 0.2103471105646458,
      "grad_norm": 1.140264913482887,
      "learning_rate": 5.559999999999999e-06,
      "loss": 0.8663,
      "step": 556
    },
    {
      "epoch": 0.21072543270594912,
      "grad_norm": 1.1185725137493638,
      "learning_rate": 5.57e-06,
      "loss": 0.9098,
      "step": 557
    },
    {
      "epoch": 0.21110375484725244,
      "grad_norm": 1.156695278835195,
      "learning_rate": 5.58e-06,
      "loss": 0.8781,
      "step": 558
    },
    {
      "epoch": 0.21148207698855576,
      "grad_norm": 1.145333592771482,
      "learning_rate": 5.59e-06,
      "loss": 0.882,
      "step": 559
    },
    {
      "epoch": 0.21186039912985907,
      "grad_norm": 1.1762140502072864,
      "learning_rate": 5.6e-06,
      "loss": 0.8269,
      "step": 560
    },
    {
      "epoch": 0.2122387212711624,
      "grad_norm": 1.1607104680787836,
      "learning_rate": 5.61e-06,
      "loss": 0.8718,
      "step": 561
    },
    {
      "epoch": 0.2126170434124657,
      "grad_norm": 1.1469573147450298,
      "learning_rate": 5.6199999999999996e-06,
      "loss": 0.9056,
      "step": 562
    },
    {
      "epoch": 0.21299536555376902,
      "grad_norm": 1.1193447632576843,
      "learning_rate": 5.6299999999999995e-06,
      "loss": 0.8501,
      "step": 563
    },
    {
      "epoch": 0.21337368769507237,
      "grad_norm": 1.136879874832253,
      "learning_rate": 5.639999999999999e-06,
      "loss": 0.8124,
      "step": 564
    },
    {
      "epoch": 0.21375200983637568,
      "grad_norm": 1.1284818158744658,
      "learning_rate": 5.65e-06,
      "loss": 0.8676,
      "step": 565
    },
    {
      "epoch": 0.214130331977679,
      "grad_norm": 1.2698716712465286,
      "learning_rate": 5.66e-06,
      "loss": 0.8661,
      "step": 566
    },
    {
      "epoch": 0.21450865411898232,
      "grad_norm": 1.153073394080358,
      "learning_rate": 5.67e-06,
      "loss": 0.8164,
      "step": 567
    },
    {
      "epoch": 0.21488697626028563,
      "grad_norm": 1.187929464303015,
      "learning_rate": 5.68e-06,
      "loss": 0.8803,
      "step": 568
    },
    {
      "epoch": 0.21526529840158895,
      "grad_norm": 1.1011027732459755,
      "learning_rate": 5.69e-06,
      "loss": 0.8709,
      "step": 569
    },
    {
      "epoch": 0.21564362054289227,
      "grad_norm": 1.104661943825339,
      "learning_rate": 5.7e-06,
      "loss": 0.8408,
      "step": 570
    },
    {
      "epoch": 0.21602194268419558,
      "grad_norm": 1.1237999429331513,
      "learning_rate": 5.7099999999999995e-06,
      "loss": 0.8316,
      "step": 571
    },
    {
      "epoch": 0.2164002648254989,
      "grad_norm": 1.188002832097036,
      "learning_rate": 5.7199999999999994e-06,
      "loss": 0.8431,
      "step": 572
    },
    {
      "epoch": 0.21677858696680224,
      "grad_norm": 1.1510459825305048,
      "learning_rate": 5.73e-06,
      "loss": 0.8847,
      "step": 573
    },
    {
      "epoch": 0.21715690910810556,
      "grad_norm": 1.0954180332540966,
      "learning_rate": 5.74e-06,
      "loss": 0.8544,
      "step": 574
    },
    {
      "epoch": 0.21753523124940888,
      "grad_norm": 1.1472545717374318,
      "learning_rate": 5.75e-06,
      "loss": 0.8249,
      "step": 575
    },
    {
      "epoch": 0.2179135533907122,
      "grad_norm": 1.175641095732617,
      "learning_rate": 5.76e-06,
      "loss": 0.8614,
      "step": 576
    },
    {
      "epoch": 0.2182918755320155,
      "grad_norm": 1.116355053736543,
      "learning_rate": 5.769999999999999e-06,
      "loss": 0.8405,
      "step": 577
    },
    {
      "epoch": 0.21867019767331883,
      "grad_norm": 1.1157321259442492,
      "learning_rate": 5.78e-06,
      "loss": 0.8786,
      "step": 578
    },
    {
      "epoch": 0.21904851981462214,
      "grad_norm": 1.1931582815103652,
      "learning_rate": 5.79e-06,
      "loss": 0.8904,
      "step": 579
    },
    {
      "epoch": 0.21942684195592546,
      "grad_norm": 1.184066717780273,
      "learning_rate": 5.7999999999999995e-06,
      "loss": 0.8508,
      "step": 580
    },
    {
      "epoch": 0.21980516409722878,
      "grad_norm": 1.161154664599336,
      "learning_rate": 5.8099999999999994e-06,
      "loss": 0.9202,
      "step": 581
    },
    {
      "epoch": 0.22018348623853212,
      "grad_norm": 1.2235874832602252,
      "learning_rate": 5.82e-06,
      "loss": 0.8361,
      "step": 582
    },
    {
      "epoch": 0.22056180837983544,
      "grad_norm": 1.1262137082837416,
      "learning_rate": 5.83e-06,
      "loss": 0.8566,
      "step": 583
    },
    {
      "epoch": 0.22094013052113876,
      "grad_norm": 1.2072112047436216,
      "learning_rate": 5.84e-06,
      "loss": 0.8632,
      "step": 584
    },
    {
      "epoch": 0.22131845266244207,
      "grad_norm": 1.1490940800541938,
      "learning_rate": 5.849999999999999e-06,
      "loss": 0.8593,
      "step": 585
    },
    {
      "epoch": 0.2216967748037454,
      "grad_norm": 1.207791799143847,
      "learning_rate": 5.86e-06,
      "loss": 0.8556,
      "step": 586
    },
    {
      "epoch": 0.2220750969450487,
      "grad_norm": 1.1526196801211563,
      "learning_rate": 5.87e-06,
      "loss": 0.8606,
      "step": 587
    },
    {
      "epoch": 0.22245341908635202,
      "grad_norm": 1.1397609148470536,
      "learning_rate": 5.88e-06,
      "loss": 0.8469,
      "step": 588
    },
    {
      "epoch": 0.22283174122765534,
      "grad_norm": 1.1785117139043815,
      "learning_rate": 5.8899999999999995e-06,
      "loss": 0.9147,
      "step": 589
    },
    {
      "epoch": 0.22321006336895866,
      "grad_norm": 1.1858125002539965,
      "learning_rate": 5.9e-06,
      "loss": 0.8849,
      "step": 590
    },
    {
      "epoch": 0.223588385510262,
      "grad_norm": 1.1941323389502188,
      "learning_rate": 5.91e-06,
      "loss": 0.869,
      "step": 591
    },
    {
      "epoch": 0.22396670765156532,
      "grad_norm": 1.1418623190210022,
      "learning_rate": 5.92e-06,
      "loss": 0.8308,
      "step": 592
    },
    {
      "epoch": 0.22434502979286863,
      "grad_norm": 1.0743417979986591,
      "learning_rate": 5.929999999999999e-06,
      "loss": 0.843,
      "step": 593
    },
    {
      "epoch": 0.22472335193417195,
      "grad_norm": 1.1529208818856194,
      "learning_rate": 5.94e-06,
      "loss": 0.8235,
      "step": 594
    },
    {
      "epoch": 0.22510167407547527,
      "grad_norm": 1.0767273225154363,
      "learning_rate": 5.95e-06,
      "loss": 0.8247,
      "step": 595
    },
    {
      "epoch": 0.22547999621677858,
      "grad_norm": 1.1070019054712885,
      "learning_rate": 5.96e-06,
      "loss": 0.8426,
      "step": 596
    },
    {
      "epoch": 0.2258583183580819,
      "grad_norm": 1.166373551635366,
      "learning_rate": 5.97e-06,
      "loss": 0.8732,
      "step": 597
    },
    {
      "epoch": 0.22623664049938522,
      "grad_norm": 1.123857925375413,
      "learning_rate": 5.98e-06,
      "loss": 0.8464,
      "step": 598
    },
    {
      "epoch": 0.22661496264068853,
      "grad_norm": 1.08557960856811,
      "learning_rate": 5.99e-06,
      "loss": 0.821,
      "step": 599
    },
    {
      "epoch": 0.22699328478199188,
      "grad_norm": 1.1164890662505647,
      "learning_rate": 6e-06,
      "loss": 0.8846,
      "step": 600
    },
    {
      "epoch": 0.2273716069232952,
      "grad_norm": 1.1514037573784872,
      "learning_rate": 6.009999999999999e-06,
      "loss": 0.8552,
      "step": 601
    },
    {
      "epoch": 0.2277499290645985,
      "grad_norm": 1.1511174146769416,
      "learning_rate": 6.019999999999999e-06,
      "loss": 0.9014,
      "step": 602
    },
    {
      "epoch": 0.22812825120590183,
      "grad_norm": 1.1696423261594386,
      "learning_rate": 6.03e-06,
      "loss": 0.8605,
      "step": 603
    },
    {
      "epoch": 0.22850657334720514,
      "grad_norm": 1.1207706559785515,
      "learning_rate": 6.04e-06,
      "loss": 0.8382,
      "step": 604
    },
    {
      "epoch": 0.22888489548850846,
      "grad_norm": 1.1767521633404514,
      "learning_rate": 6.05e-06,
      "loss": 0.9206,
      "step": 605
    },
    {
      "epoch": 0.22926321762981178,
      "grad_norm": 1.1758374604143937,
      "learning_rate": 6.06e-06,
      "loss": 0.8883,
      "step": 606
    },
    {
      "epoch": 0.2296415397711151,
      "grad_norm": 1.148791521470335,
      "learning_rate": 6.07e-06,
      "loss": 0.9091,
      "step": 607
    },
    {
      "epoch": 0.2300198619124184,
      "grad_norm": 1.1533752302256568,
      "learning_rate": 6.079999999999999e-06,
      "loss": 0.915,
      "step": 608
    },
    {
      "epoch": 0.23039818405372176,
      "grad_norm": 1.1082862913426186,
      "learning_rate": 6.089999999999999e-06,
      "loss": 0.8259,
      "step": 609
    },
    {
      "epoch": 0.23077650619502507,
      "grad_norm": 1.1400168808816862,
      "learning_rate": 6.099999999999999e-06,
      "loss": 0.8417,
      "step": 610
    },
    {
      "epoch": 0.2311548283363284,
      "grad_norm": 1.149922499835282,
      "learning_rate": 6.11e-06,
      "loss": 0.8736,
      "step": 611
    },
    {
      "epoch": 0.2315331504776317,
      "grad_norm": 1.1611344187938348,
      "learning_rate": 6.12e-06,
      "loss": 0.8376,
      "step": 612
    },
    {
      "epoch": 0.23191147261893502,
      "grad_norm": 1.1787603376828737,
      "learning_rate": 6.13e-06,
      "loss": 0.8558,
      "step": 613
    },
    {
      "epoch": 0.23228979476023834,
      "grad_norm": 1.155525289243939,
      "learning_rate": 6.14e-06,
      "loss": 0.8463,
      "step": 614
    },
    {
      "epoch": 0.23266811690154166,
      "grad_norm": 1.1589832886045384,
      "learning_rate": 6.15e-06,
      "loss": 0.8182,
      "step": 615
    },
    {
      "epoch": 0.23304643904284497,
      "grad_norm": 1.1033596458549921,
      "learning_rate": 6.1599999999999995e-06,
      "loss": 0.8324,
      "step": 616
    },
    {
      "epoch": 0.23342476118414832,
      "grad_norm": 1.2358470403500466,
      "learning_rate": 6.169999999999999e-06,
      "loss": 0.8682,
      "step": 617
    },
    {
      "epoch": 0.23380308332545163,
      "grad_norm": 1.0984535537652391,
      "learning_rate": 6.179999999999999e-06,
      "loss": 0.8332,
      "step": 618
    },
    {
      "epoch": 0.23418140546675495,
      "grad_norm": 1.2128396124349823,
      "learning_rate": 6.19e-06,
      "loss": 0.8747,
      "step": 619
    },
    {
      "epoch": 0.23455972760805827,
      "grad_norm": 1.2275794235621071,
      "learning_rate": 6.2e-06,
      "loss": 0.8953,
      "step": 620
    },
    {
      "epoch": 0.23493804974936158,
      "grad_norm": 1.2542101409168016,
      "learning_rate": 6.21e-06,
      "loss": 0.8892,
      "step": 621
    },
    {
      "epoch": 0.2353163718906649,
      "grad_norm": 1.204474995156125,
      "learning_rate": 6.22e-06,
      "loss": 0.8491,
      "step": 622
    },
    {
      "epoch": 0.23569469403196822,
      "grad_norm": 1.1548886283677673,
      "learning_rate": 6.2300000000000005e-06,
      "loss": 0.8581,
      "step": 623
    },
    {
      "epoch": 0.23607301617327153,
      "grad_norm": 1.251297532099902,
      "learning_rate": 6.2399999999999995e-06,
      "loss": 0.851,
      "step": 624
    },
    {
      "epoch": 0.23645133831457485,
      "grad_norm": 1.218716341983368,
      "learning_rate": 6.2499999999999995e-06,
      "loss": 0.917,
      "step": 625
    },
    {
      "epoch": 0.2368296604558782,
      "grad_norm": 1.1845662251647084,
      "learning_rate": 6.259999999999999e-06,
      "loss": 0.9132,
      "step": 626
    },
    {
      "epoch": 0.2372079825971815,
      "grad_norm": 1.1620810200029381,
      "learning_rate": 6.269999999999999e-06,
      "loss": 0.8652,
      "step": 627
    },
    {
      "epoch": 0.23758630473848483,
      "grad_norm": 1.1563059559969693,
      "learning_rate": 6.28e-06,
      "loss": 0.8474,
      "step": 628
    },
    {
      "epoch": 0.23796462687978814,
      "grad_norm": 1.1388389502769878,
      "learning_rate": 6.29e-06,
      "loss": 0.8314,
      "step": 629
    },
    {
      "epoch": 0.23834294902109146,
      "grad_norm": 1.1551456623854715,
      "learning_rate": 6.3e-06,
      "loss": 0.8902,
      "step": 630
    },
    {
      "epoch": 0.23872127116239478,
      "grad_norm": 1.1459750574525491,
      "learning_rate": 6.31e-06,
      "loss": 0.8505,
      "step": 631
    },
    {
      "epoch": 0.2390995933036981,
      "grad_norm": 1.0925608036319805,
      "learning_rate": 6.32e-06,
      "loss": 0.8651,
      "step": 632
    },
    {
      "epoch": 0.2394779154450014,
      "grad_norm": 1.1607966985031983,
      "learning_rate": 6.3299999999999995e-06,
      "loss": 0.8156,
      "step": 633
    },
    {
      "epoch": 0.23985623758630473,
      "grad_norm": 1.112649862871437,
      "learning_rate": 6.3399999999999994e-06,
      "loss": 0.823,
      "step": 634
    },
    {
      "epoch": 0.24023455972760807,
      "grad_norm": 1.1213541389814015,
      "learning_rate": 6.349999999999999e-06,
      "loss": 0.8397,
      "step": 635
    },
    {
      "epoch": 0.2406128818689114,
      "grad_norm": 1.134629038613528,
      "learning_rate": 6.36e-06,
      "loss": 0.8503,
      "step": 636
    },
    {
      "epoch": 0.2409912040102147,
      "grad_norm": 1.1342734785655144,
      "learning_rate": 6.37e-06,
      "loss": 0.8497,
      "step": 637
    },
    {
      "epoch": 0.24136952615151802,
      "grad_norm": 1.1277526276470056,
      "learning_rate": 6.38e-06,
      "loss": 0.8348,
      "step": 638
    },
    {
      "epoch": 0.24174784829282134,
      "grad_norm": 1.1313262215365258,
      "learning_rate": 6.39e-06,
      "loss": 0.8746,
      "step": 639
    },
    {
      "epoch": 0.24212617043412465,
      "grad_norm": 1.0984126709233168,
      "learning_rate": 6.4e-06,
      "loss": 0.8296,
      "step": 640
    },
    {
      "epoch": 0.24250449257542797,
      "grad_norm": 1.0888784783993595,
      "learning_rate": 6.41e-06,
      "loss": 0.8129,
      "step": 641
    },
    {
      "epoch": 0.2428828147167313,
      "grad_norm": 1.1461818324642985,
      "learning_rate": 6.4199999999999995e-06,
      "loss": 0.8834,
      "step": 642
    },
    {
      "epoch": 0.2432611368580346,
      "grad_norm": 1.1427506153934843,
      "learning_rate": 6.429999999999999e-06,
      "loss": 0.8706,
      "step": 643
    },
    {
      "epoch": 0.24363945899933795,
      "grad_norm": 1.144102199065487,
      "learning_rate": 6.44e-06,
      "loss": 0.8877,
      "step": 644
    },
    {
      "epoch": 0.24401778114064127,
      "grad_norm": 1.1231424595451174,
      "learning_rate": 6.45e-06,
      "loss": 0.8939,
      "step": 645
    },
    {
      "epoch": 0.24439610328194458,
      "grad_norm": 1.1218026132749124,
      "learning_rate": 6.46e-06,
      "loss": 0.8366,
      "step": 646
    },
    {
      "epoch": 0.2447744254232479,
      "grad_norm": 1.2086540508049943,
      "learning_rate": 6.469999999999999e-06,
      "loss": 0.892,
      "step": 647
    },
    {
      "epoch": 0.24515274756455122,
      "grad_norm": 1.0868363589750187,
      "learning_rate": 6.48e-06,
      "loss": 0.8581,
      "step": 648
    },
    {
      "epoch": 0.24553106970585453,
      "grad_norm": 1.1504181380058272,
      "learning_rate": 6.49e-06,
      "loss": 0.8942,
      "step": 649
    },
    {
      "epoch": 0.24590939184715785,
      "grad_norm": 1.1874832509790985,
      "learning_rate": 6.5e-06,
      "loss": 0.8379,
      "step": 650
    },
    {
      "epoch": 0.24628771398846117,
      "grad_norm": 1.1066886977698138,
      "learning_rate": 6.5099999999999995e-06,
      "loss": 0.8645,
      "step": 651
    },
    {
      "epoch": 0.24666603612976448,
      "grad_norm": 1.1091171121306154,
      "learning_rate": 6.519999999999999e-06,
      "loss": 0.8866,
      "step": 652
    },
    {
      "epoch": 0.24704435827106783,
      "grad_norm": 1.1168392333785764,
      "learning_rate": 6.53e-06,
      "loss": 0.8377,
      "step": 653
    },
    {
      "epoch": 0.24742268041237114,
      "grad_norm": 1.1333024723334617,
      "learning_rate": 6.54e-06,
      "loss": 0.8404,
      "step": 654
    },
    {
      "epoch": 0.24780100255367446,
      "grad_norm": 1.1624311607412376,
      "learning_rate": 6.549999999999999e-06,
      "loss": 0.8578,
      "step": 655
    },
    {
      "epoch": 0.24817932469497778,
      "grad_norm": 1.140510520926876,
      "learning_rate": 6.559999999999999e-06,
      "loss": 0.7948,
      "step": 656
    },
    {
      "epoch": 0.2485576468362811,
      "grad_norm": 1.1241297695775005,
      "learning_rate": 6.57e-06,
      "loss": 0.8455,
      "step": 657
    },
    {
      "epoch": 0.2489359689775844,
      "grad_norm": 1.1171688585786779,
      "learning_rate": 6.58e-06,
      "loss": 0.8347,
      "step": 658
    },
    {
      "epoch": 0.24931429111888773,
      "grad_norm": 1.131716974118065,
      "learning_rate": 6.59e-06,
      "loss": 0.8624,
      "step": 659
    },
    {
      "epoch": 0.24969261326019104,
      "grad_norm": 1.1586113355227856,
      "learning_rate": 6.5999999999999995e-06,
      "loss": 0.8937,
      "step": 660
    },
    {
      "epoch": 0.2500709354014944,
      "grad_norm": 1.186938370866149,
      "learning_rate": 6.61e-06,
      "loss": 0.8523,
      "step": 661
    },
    {
      "epoch": 0.2504492575427977,
      "grad_norm": 1.1500652838613878,
      "learning_rate": 6.62e-06,
      "loss": 0.8537,
      "step": 662
    },
    {
      "epoch": 0.250827579684101,
      "grad_norm": 1.2121811392488833,
      "learning_rate": 6.629999999999999e-06,
      "loss": 0.8477,
      "step": 663
    },
    {
      "epoch": 0.2512059018254043,
      "grad_norm": 1.1348675624901883,
      "learning_rate": 6.639999999999999e-06,
      "loss": 0.8502,
      "step": 664
    },
    {
      "epoch": 0.25158422396670765,
      "grad_norm": 1.102535269461347,
      "learning_rate": 6.65e-06,
      "loss": 0.8745,
      "step": 665
    },
    {
      "epoch": 0.25158422396670765,
      "eval_loss": 0.8625780940055847,
      "eval_runtime": 27.0021,
      "eval_samples_per_second": 32.775,
      "eval_steps_per_second": 1.037,
      "step": 665
    },
    {
      "epoch": 0.25158422396670765,
      "eval_bench_accuracy_arc_challenge": 0.24285714285714285,
      "eval_bench_accuracy_hellaswag": 0.225,
      "eval_bench_accuracy_mmlu": 0.2782608695652174,
      "eval_bench_average_accuracy": 0.24870600414078672,
      "eval_bench_loss": 5.327823571991503,
      "eval_bench_total_accuracy": 0.24395604395604395,
      "step": 665
    },
    {
      "epoch": 0.251962546108011,
      "grad_norm": 1.149499114356956,
      "learning_rate": 6.66e-06,
      "loss": 0.8693,
      "step": 666
    },
    {
      "epoch": 0.2523408682493143,
      "grad_norm": 1.161075438749712,
      "learning_rate": 6.67e-06,
      "loss": 0.9075,
      "step": 667
    },
    {
      "epoch": 0.25271919039061763,
      "grad_norm": 1.141541764628487,
      "learning_rate": 6.6799999999999996e-06,
      "loss": 0.8643,
      "step": 668
    },
    {
      "epoch": 0.2530975125319209,
      "grad_norm": 1.1390764097501647,
      "learning_rate": 6.69e-06,
      "loss": 0.8752,
      "step": 669
    },
    {
      "epoch": 0.25347583467322427,
      "grad_norm": 1.1198865085900025,
      "learning_rate": 6.7e-06,
      "loss": 0.8403,
      "step": 670
    },
    {
      "epoch": 0.25385415681452755,
      "grad_norm": 1.143235453200182,
      "learning_rate": 6.709999999999999e-06,
      "loss": 0.8347,
      "step": 671
    },
    {
      "epoch": 0.2542324789558309,
      "grad_norm": 1.105054342960603,
      "learning_rate": 6.719999999999999e-06,
      "loss": 0.877,
      "step": 672
    },
    {
      "epoch": 0.2546108010971342,
      "grad_norm": 1.1899413861555337,
      "learning_rate": 6.73e-06,
      "loss": 0.8239,
      "step": 673
    },
    {
      "epoch": 0.25498912323843753,
      "grad_norm": 1.1305008415556128,
      "learning_rate": 6.74e-06,
      "loss": 0.8598,
      "step": 674
    },
    {
      "epoch": 0.2553674453797409,
      "grad_norm": 1.168034799536073,
      "learning_rate": 6.75e-06,
      "loss": 0.8294,
      "step": 675
    },
    {
      "epoch": 0.25574576752104416,
      "grad_norm": 1.1472097884900647,
      "learning_rate": 6.76e-06,
      "loss": 0.9007,
      "step": 676
    },
    {
      "epoch": 0.2561240896623475,
      "grad_norm": 1.0931411919432397,
      "learning_rate": 6.7699999999999996e-06,
      "loss": 0.8326,
      "step": 677
    },
    {
      "epoch": 0.2565024118036508,
      "grad_norm": 1.1510688024969498,
      "learning_rate": 6.78e-06,
      "loss": 0.8828,
      "step": 678
    },
    {
      "epoch": 0.25688073394495414,
      "grad_norm": 1.1191461068866526,
      "learning_rate": 6.789999999999999e-06,
      "loss": 0.8461,
      "step": 679
    },
    {
      "epoch": 0.25725905608625743,
      "grad_norm": 1.1041404496614182,
      "learning_rate": 6.799999999999999e-06,
      "loss": 0.8285,
      "step": 680
    },
    {
      "epoch": 0.2576373782275608,
      "grad_norm": 1.1012877673575499,
      "learning_rate": 6.809999999999999e-06,
      "loss": 0.8548,
      "step": 681
    },
    {
      "epoch": 0.25801570036886406,
      "grad_norm": 1.1057501522176822,
      "learning_rate": 6.82e-06,
      "loss": 0.8591,
      "step": 682
    },
    {
      "epoch": 0.2583940225101674,
      "grad_norm": 1.1498742481849225,
      "learning_rate": 6.83e-06,
      "loss": 0.8661,
      "step": 683
    },
    {
      "epoch": 0.25877234465147075,
      "grad_norm": 1.1378178315852814,
      "learning_rate": 6.84e-06,
      "loss": 0.8759,
      "step": 684
    },
    {
      "epoch": 0.25915066679277404,
      "grad_norm": 1.1011069671017035,
      "learning_rate": 6.85e-06,
      "loss": 0.823,
      "step": 685
    },
    {
      "epoch": 0.2595289889340774,
      "grad_norm": 1.160807734407358,
      "learning_rate": 6.86e-06,
      "loss": 0.8732,
      "step": 686
    },
    {
      "epoch": 0.2599073110753807,
      "grad_norm": 1.0867868118261128,
      "learning_rate": 6.8699999999999994e-06,
      "loss": 0.8367,
      "step": 687
    },
    {
      "epoch": 0.260285633216684,
      "grad_norm": 1.0969221739263768,
      "learning_rate": 6.879999999999999e-06,
      "loss": 0.8647,
      "step": 688
    },
    {
      "epoch": 0.2606639553579873,
      "grad_norm": 1.0995292401504533,
      "learning_rate": 6.889999999999999e-06,
      "loss": 0.8524,
      "step": 689
    },
    {
      "epoch": 0.26104227749929065,
      "grad_norm": 1.1692507904848903,
      "learning_rate": 6.9e-06,
      "loss": 0.8519,
      "step": 690
    },
    {
      "epoch": 0.26142059964059394,
      "grad_norm": 1.0998400071794445,
      "learning_rate": 6.91e-06,
      "loss": 0.8287,
      "step": 691
    },
    {
      "epoch": 0.2617989217818973,
      "grad_norm": 1.1968950530047644,
      "learning_rate": 6.92e-06,
      "loss": 0.8138,
      "step": 692
    },
    {
      "epoch": 0.26217724392320063,
      "grad_norm": 1.095854905073934,
      "learning_rate": 6.93e-06,
      "loss": 0.8568,
      "step": 693
    },
    {
      "epoch": 0.2625555660645039,
      "grad_norm": 1.1079273378796317,
      "learning_rate": 6.9400000000000005e-06,
      "loss": 0.8353,
      "step": 694
    },
    {
      "epoch": 0.26293388820580726,
      "grad_norm": 1.1606191819435765,
      "learning_rate": 6.9499999999999995e-06,
      "loss": 0.8561,
      "step": 695
    },
    {
      "epoch": 0.26331221034711055,
      "grad_norm": 1.0902425837878627,
      "learning_rate": 6.9599999999999994e-06,
      "loss": 0.8391,
      "step": 696
    },
    {
      "epoch": 0.2636905324884139,
      "grad_norm": 1.1206727493642596,
      "learning_rate": 6.969999999999999e-06,
      "loss": 0.8233,
      "step": 697
    },
    {
      "epoch": 0.2640688546297172,
      "grad_norm": 1.0982647837307586,
      "learning_rate": 6.98e-06,
      "loss": 0.8602,
      "step": 698
    },
    {
      "epoch": 0.26444717677102053,
      "grad_norm": 1.0871328583668558,
      "learning_rate": 6.99e-06,
      "loss": 0.8299,
      "step": 699
    },
    {
      "epoch": 0.2648254989123238,
      "grad_norm": 1.1008815238203256,
      "learning_rate": 7e-06,
      "loss": 0.8341,
      "step": 700
    },
    {
      "epoch": 0.26520382105362716,
      "grad_norm": 1.1750095526723472,
      "learning_rate": 7.01e-06,
      "loss": 0.8682,
      "step": 701
    },
    {
      "epoch": 0.2655821431949305,
      "grad_norm": 1.1415931541767914,
      "learning_rate": 7.019999999999999e-06,
      "loss": 0.8932,
      "step": 702
    },
    {
      "epoch": 0.2659604653362338,
      "grad_norm": 1.0981715817655127,
      "learning_rate": 7.03e-06,
      "loss": 0.838,
      "step": 703
    },
    {
      "epoch": 0.26633878747753714,
      "grad_norm": 1.0986067356062597,
      "learning_rate": 7.0399999999999995e-06,
      "loss": 0.8503,
      "step": 704
    },
    {
      "epoch": 0.26671710961884043,
      "grad_norm": 1.1084347528867848,
      "learning_rate": 7.049999999999999e-06,
      "loss": 0.8958,
      "step": 705
    },
    {
      "epoch": 0.2670954317601438,
      "grad_norm": 1.1475294765378516,
      "learning_rate": 7.059999999999999e-06,
      "loss": 0.8496,
      "step": 706
    },
    {
      "epoch": 0.26747375390144706,
      "grad_norm": 1.117143691203432,
      "learning_rate": 7.07e-06,
      "loss": 0.875,
      "step": 707
    },
    {
      "epoch": 0.2678520760427504,
      "grad_norm": 1.1331250955748378,
      "learning_rate": 7.08e-06,
      "loss": 0.854,
      "step": 708
    },
    {
      "epoch": 0.2682303981840537,
      "grad_norm": 1.0837995640069416,
      "learning_rate": 7.09e-06,
      "loss": 0.8461,
      "step": 709
    },
    {
      "epoch": 0.26860872032535704,
      "grad_norm": 1.0933867992273585,
      "learning_rate": 7.099999999999999e-06,
      "loss": 0.8383,
      "step": 710
    },
    {
      "epoch": 0.2689870424666604,
      "grad_norm": 1.0862191237112888,
      "learning_rate": 7.11e-06,
      "loss": 0.7976,
      "step": 711
    },
    {
      "epoch": 0.2693653646079637,
      "grad_norm": 1.1151836826262986,
      "learning_rate": 7.12e-06,
      "loss": 0.8224,
      "step": 712
    },
    {
      "epoch": 0.269743686749267,
      "grad_norm": 1.189062828656012,
      "learning_rate": 7.1299999999999995e-06,
      "loss": 0.8917,
      "step": 713
    },
    {
      "epoch": 0.2701220088905703,
      "grad_norm": 1.1119181389921133,
      "learning_rate": 7.139999999999999e-06,
      "loss": 0.8291,
      "step": 714
    },
    {
      "epoch": 0.27050033103187365,
      "grad_norm": 1.114538144475484,
      "learning_rate": 7.15e-06,
      "loss": 0.8996,
      "step": 715
    },
    {
      "epoch": 0.27087865317317694,
      "grad_norm": 1.1005437857491667,
      "learning_rate": 7.16e-06,
      "loss": 0.7888,
      "step": 716
    },
    {
      "epoch": 0.2712569753144803,
      "grad_norm": 1.1146994809955666,
      "learning_rate": 7.17e-06,
      "loss": 0.8878,
      "step": 717
    },
    {
      "epoch": 0.2716352974557836,
      "grad_norm": 1.0936279250904897,
      "learning_rate": 7.179999999999999e-06,
      "loss": 0.8672,
      "step": 718
    },
    {
      "epoch": 0.2720136195970869,
      "grad_norm": 1.1366251894998205,
      "learning_rate": 7.19e-06,
      "loss": 0.8858,
      "step": 719
    },
    {
      "epoch": 0.27239194173839026,
      "grad_norm": 1.1195931324613553,
      "learning_rate": 7.2e-06,
      "loss": 0.8507,
      "step": 720
    },
    {
      "epoch": 0.27277026387969355,
      "grad_norm": 1.0935327911384591,
      "learning_rate": 7.21e-06,
      "loss": 0.8424,
      "step": 721
    },
    {
      "epoch": 0.2731485860209969,
      "grad_norm": 1.0953372322434138,
      "learning_rate": 7.2199999999999995e-06,
      "loss": 0.8831,
      "step": 722
    },
    {
      "epoch": 0.2735269081623002,
      "grad_norm": 1.0904032768722667,
      "learning_rate": 7.23e-06,
      "loss": 0.8334,
      "step": 723
    },
    {
      "epoch": 0.27390523030360353,
      "grad_norm": 1.1346874176897102,
      "learning_rate": 7.24e-06,
      "loss": 0.8506,
      "step": 724
    },
    {
      "epoch": 0.2742835524449068,
      "grad_norm": 1.154262444900059,
      "learning_rate": 7.25e-06,
      "loss": 0.8393,
      "step": 725
    },
    {
      "epoch": 0.27466187458621016,
      "grad_norm": 1.1336981217637951,
      "learning_rate": 7.259999999999999e-06,
      "loss": 0.8371,
      "step": 726
    },
    {
      "epoch": 0.27504019672751345,
      "grad_norm": 1.1530922109530841,
      "learning_rate": 7.269999999999999e-06,
      "loss": 0.9141,
      "step": 727
    },
    {
      "epoch": 0.2754185188688168,
      "grad_norm": 1.1414400257725132,
      "learning_rate": 7.28e-06,
      "loss": 0.8615,
      "step": 728
    },
    {
      "epoch": 0.27579684101012014,
      "grad_norm": 1.0747602134856014,
      "learning_rate": 7.29e-06,
      "loss": 0.8507,
      "step": 729
    },
    {
      "epoch": 0.27617516315142343,
      "grad_norm": 1.1341332656767107,
      "learning_rate": 7.2999999999999996e-06,
      "loss": 0.8771,
      "step": 730
    },
    {
      "epoch": 0.2765534852927268,
      "grad_norm": 1.127774756748704,
      "learning_rate": 7.3099999999999995e-06,
      "loss": 0.8559,
      "step": 731
    },
    {
      "epoch": 0.27693180743403006,
      "grad_norm": 1.106246473020497,
      "learning_rate": 7.32e-06,
      "loss": 0.8333,
      "step": 732
    },
    {
      "epoch": 0.2773101295753334,
      "grad_norm": 1.072619886572064,
      "learning_rate": 7.33e-06,
      "loss": 0.8138,
      "step": 733
    },
    {
      "epoch": 0.2776884517166367,
      "grad_norm": 1.1053237591292755,
      "learning_rate": 7.339999999999999e-06,
      "loss": 0.8929,
      "step": 734
    },
    {
      "epoch": 0.27806677385794004,
      "grad_norm": 1.0590657569440343,
      "learning_rate": 7.349999999999999e-06,
      "loss": 0.8657,
      "step": 735
    },
    {
      "epoch": 0.27844509599924333,
      "grad_norm": 1.0990511323540157,
      "learning_rate": 7.36e-06,
      "loss": 0.831,
      "step": 736
    },
    {
      "epoch": 0.2788234181405467,
      "grad_norm": 1.0960494967933392,
      "learning_rate": 7.37e-06,
      "loss": 0.8672,
      "step": 737
    },
    {
      "epoch": 0.27920174028185,
      "grad_norm": 1.0923972930315522,
      "learning_rate": 7.38e-06,
      "loss": 0.8359,
      "step": 738
    },
    {
      "epoch": 0.2795800624231533,
      "grad_norm": 1.117398170352597,
      "learning_rate": 7.3899999999999995e-06,
      "loss": 0.8678,
      "step": 739
    },
    {
      "epoch": 0.27995838456445665,
      "grad_norm": 1.0964334876514574,
      "learning_rate": 7.4e-06,
      "loss": 0.8175,
      "step": 740
    },
    {
      "epoch": 0.28033670670575994,
      "grad_norm": 1.137429209179925,
      "learning_rate": 7.41e-06,
      "loss": 0.8469,
      "step": 741
    },
    {
      "epoch": 0.2807150288470633,
      "grad_norm": 1.1550309848051612,
      "learning_rate": 7.419999999999999e-06,
      "loss": 0.8326,
      "step": 742
    },
    {
      "epoch": 0.2810933509883666,
      "grad_norm": 1.1935237789558146,
      "learning_rate": 7.429999999999999e-06,
      "loss": 0.8568,
      "step": 743
    },
    {
      "epoch": 0.2814716731296699,
      "grad_norm": 1.1694982973025607,
      "learning_rate": 7.44e-06,
      "loss": 0.8869,
      "step": 744
    },
    {
      "epoch": 0.2818499952709732,
      "grad_norm": 1.1920139094347593,
      "learning_rate": 7.45e-06,
      "loss": 0.8487,
      "step": 745
    },
    {
      "epoch": 0.28222831741227655,
      "grad_norm": 1.1367845567285337,
      "learning_rate": 7.46e-06,
      "loss": 0.8554,
      "step": 746
    },
    {
      "epoch": 0.2826066395535799,
      "grad_norm": 1.1505063717374056,
      "learning_rate": 7.47e-06,
      "loss": 0.8371,
      "step": 747
    },
    {
      "epoch": 0.2829849616948832,
      "grad_norm": 1.1339987287473563,
      "learning_rate": 7.48e-06,
      "loss": 0.8256,
      "step": 748
    },
    {
      "epoch": 0.28336328383618653,
      "grad_norm": 1.158977003616627,
      "learning_rate": 7.49e-06,
      "loss": 0.8913,
      "step": 749
    },
    {
      "epoch": 0.2837416059774898,
      "grad_norm": 1.1022707433616572,
      "learning_rate": 7.499999999999999e-06,
      "loss": 0.8117,
      "step": 750
    },
    {
      "epoch": 0.28411992811879316,
      "grad_norm": 1.1550634309139105,
      "learning_rate": 7.509999999999999e-06,
      "loss": 0.8906,
      "step": 751
    },
    {
      "epoch": 0.28449825026009645,
      "grad_norm": 1.090317910646282,
      "learning_rate": 7.519999999999999e-06,
      "loss": 0.8799,
      "step": 752
    },
    {
      "epoch": 0.2848765724013998,
      "grad_norm": 1.0677643984555838,
      "learning_rate": 7.53e-06,
      "loss": 0.8653,
      "step": 753
    },
    {
      "epoch": 0.2852548945427031,
      "grad_norm": 1.1663544994037678,
      "learning_rate": 7.54e-06,
      "loss": 0.8737,
      "step": 754
    },
    {
      "epoch": 0.28563321668400643,
      "grad_norm": 1.0973153975053445,
      "learning_rate": 7.55e-06,
      "loss": 0.8485,
      "step": 755
    },
    {
      "epoch": 0.2860115388253098,
      "grad_norm": 1.0761549351444184,
      "learning_rate": 7.56e-06,
      "loss": 0.8284,
      "step": 756
    },
    {
      "epoch": 0.28638986096661306,
      "grad_norm": 1.1355050591654032,
      "learning_rate": 7.5699999999999995e-06,
      "loss": 0.8348,
      "step": 757
    },
    {
      "epoch": 0.2867681831079164,
      "grad_norm": 1.116699730612722,
      "learning_rate": 7.5799999999999994e-06,
      "loss": 0.8405,
      "step": 758
    },
    {
      "epoch": 0.2871465052492197,
      "grad_norm": 1.1037588379626753,
      "learning_rate": 7.589999999999999e-06,
      "loss": 0.8652,
      "step": 759
    },
    {
      "epoch": 0.28752482739052304,
      "grad_norm": 1.092569661781677,
      "learning_rate": 7.599999999999999e-06,
      "loss": 0.8786,
      "step": 760
    },
    {
      "epoch": 0.28790314953182633,
      "grad_norm": 1.1079207038423997,
      "learning_rate": 7.61e-06,
      "loss": 0.8731,
      "step": 761
    },
    {
      "epoch": 0.2882814716731297,
      "grad_norm": 1.0840455559100046,
      "learning_rate": 7.62e-06,
      "loss": 0.8533,
      "step": 762
    },
    {
      "epoch": 0.28865979381443296,
      "grad_norm": 1.1088308729059055,
      "learning_rate": 7.63e-06,
      "loss": 0.8407,
      "step": 763
    },
    {
      "epoch": 0.2890381159557363,
      "grad_norm": 1.070788168887275,
      "learning_rate": 7.64e-06,
      "loss": 0.8919,
      "step": 764
    },
    {
      "epoch": 0.28941643809703965,
      "grad_norm": 1.060969292922543,
      "learning_rate": 7.65e-06,
      "loss": 0.812,
      "step": 765
    },
    {
      "epoch": 0.28979476023834294,
      "grad_norm": 1.1301219505514637,
      "learning_rate": 7.66e-06,
      "loss": 0.8336,
      "step": 766
    },
    {
      "epoch": 0.2901730823796463,
      "grad_norm": 1.0534794694384884,
      "learning_rate": 7.67e-06,
      "loss": 0.8329,
      "step": 767
    },
    {
      "epoch": 0.2905514045209496,
      "grad_norm": 1.1347313685498166,
      "learning_rate": 7.68e-06,
      "loss": 0.8793,
      "step": 768
    },
    {
      "epoch": 0.2909297266622529,
      "grad_norm": 1.1475444842715925,
      "learning_rate": 7.69e-06,
      "loss": 0.8508,
      "step": 769
    },
    {
      "epoch": 0.2913080488035562,
      "grad_norm": 1.131952349011137,
      "learning_rate": 7.699999999999999e-06,
      "loss": 0.845,
      "step": 770
    },
    {
      "epoch": 0.29168637094485955,
      "grad_norm": 1.1447781586459667,
      "learning_rate": 7.709999999999999e-06,
      "loss": 0.8726,
      "step": 771
    },
    {
      "epoch": 0.29206469308616284,
      "grad_norm": 1.1327583004535982,
      "learning_rate": 7.719999999999999e-06,
      "loss": 0.8104,
      "step": 772
    },
    {
      "epoch": 0.2924430152274662,
      "grad_norm": 1.128617220703407,
      "learning_rate": 7.73e-06,
      "loss": 0.8176,
      "step": 773
    },
    {
      "epoch": 0.29282133736876953,
      "grad_norm": 1.1023174787003673,
      "learning_rate": 7.74e-06,
      "loss": 0.8428,
      "step": 774
    },
    {
      "epoch": 0.2931996595100728,
      "grad_norm": 1.1676360521088707,
      "learning_rate": 7.75e-06,
      "loss": 0.8811,
      "step": 775
    },
    {
      "epoch": 0.29357798165137616,
      "grad_norm": 1.1926785192763554,
      "learning_rate": 7.76e-06,
      "loss": 0.8814,
      "step": 776
    },
    {
      "epoch": 0.29395630379267945,
      "grad_norm": 1.0926242154672956,
      "learning_rate": 7.769999999999998e-06,
      "loss": 0.8697,
      "step": 777
    },
    {
      "epoch": 0.2943346259339828,
      "grad_norm": 1.1477061183634145,
      "learning_rate": 7.78e-06,
      "loss": 0.883,
      "step": 778
    },
    {
      "epoch": 0.2947129480752861,
      "grad_norm": 1.0524242129666213,
      "learning_rate": 7.79e-06,
      "loss": 0.8285,
      "step": 779
    },
    {
      "epoch": 0.29509127021658943,
      "grad_norm": 1.1003220338231798,
      "learning_rate": 7.8e-06,
      "loss": 0.873,
      "step": 780
    },
    {
      "epoch": 0.2954695923578927,
      "grad_norm": 1.0924766297335016,
      "learning_rate": 7.81e-06,
      "loss": 0.8388,
      "step": 781
    },
    {
      "epoch": 0.29584791449919606,
      "grad_norm": 1.0905974324189436,
      "learning_rate": 7.82e-06,
      "loss": 0.8456,
      "step": 782
    },
    {
      "epoch": 0.2962262366404994,
      "grad_norm": 1.0784036223330382,
      "learning_rate": 7.83e-06,
      "loss": 0.8732,
      "step": 783
    },
    {
      "epoch": 0.2966045587818027,
      "grad_norm": 1.0471596415042548,
      "learning_rate": 7.84e-06,
      "loss": 0.8396,
      "step": 784
    },
    {
      "epoch": 0.29698288092310604,
      "grad_norm": 1.080443491875735,
      "learning_rate": 7.85e-06,
      "loss": 0.8458,
      "step": 785
    },
    {
      "epoch": 0.29736120306440933,
      "grad_norm": 1.0828576066417819,
      "learning_rate": 7.86e-06,
      "loss": 0.813,
      "step": 786
    },
    {
      "epoch": 0.2977395252057127,
      "grad_norm": 1.0752539748255008,
      "learning_rate": 7.87e-06,
      "loss": 0.8564,
      "step": 787
    },
    {
      "epoch": 0.29811784734701596,
      "grad_norm": 1.0994217833391198,
      "learning_rate": 7.879999999999999e-06,
      "loss": 0.8263,
      "step": 788
    },
    {
      "epoch": 0.2984961694883193,
      "grad_norm": 1.086381772786406,
      "learning_rate": 7.889999999999999e-06,
      "loss": 0.8563,
      "step": 789
    },
    {
      "epoch": 0.2988744916296226,
      "grad_norm": 1.1088374241291266,
      "learning_rate": 7.9e-06,
      "loss": 0.864,
      "step": 790
    },
    {
      "epoch": 0.29925281377092594,
      "grad_norm": 1.1571412075379082,
      "learning_rate": 7.91e-06,
      "loss": 0.8171,
      "step": 791
    },
    {
      "epoch": 0.2996311359122293,
      "grad_norm": 1.1203389931533279,
      "learning_rate": 7.92e-06,
      "loss": 0.8441,
      "step": 792
    },
    {
      "epoch": 0.3000094580535326,
      "grad_norm": 1.0955306189611171,
      "learning_rate": 7.929999999999999e-06,
      "loss": 0.8367,
      "step": 793
    },
    {
      "epoch": 0.3003877801948359,
      "grad_norm": 1.0518036198212661,
      "learning_rate": 7.94e-06,
      "loss": 0.8115,
      "step": 794
    },
    {
      "epoch": 0.3007661023361392,
      "grad_norm": 1.1024545203471212,
      "learning_rate": 7.95e-06,
      "loss": 0.8981,
      "step": 795
    },
    {
      "epoch": 0.30114442447744255,
      "grad_norm": 1.1408707488859684,
      "learning_rate": 7.96e-06,
      "loss": 0.8574,
      "step": 796
    },
    {
      "epoch": 0.30152274661874584,
      "grad_norm": 1.0664606162956756,
      "learning_rate": 7.97e-06,
      "loss": 0.851,
      "step": 797
    },
    {
      "epoch": 0.3019010687600492,
      "grad_norm": 1.1045392245613144,
      "learning_rate": 7.98e-06,
      "loss": 0.8472,
      "step": 798
    },
    {
      "epoch": 0.3019010687600492,
      "eval_loss": 0.850925862789154,
      "eval_runtime": 26.6744,
      "eval_samples_per_second": 33.178,
      "eval_steps_per_second": 1.05,
      "step": 798
    },
    {
      "epoch": 0.3019010687600492,
      "eval_bench_accuracy_arc_challenge": 0.21428571428571427,
      "eval_bench_accuracy_hellaswag": 0.235,
      "eval_bench_accuracy_mmlu": 0.28695652173913044,
      "eval_bench_average_accuracy": 0.24541407867494824,
      "eval_bench_loss": 4.9830322265625,
      "eval_bench_total_accuracy": 0.24175824175824176,
      "step": 798
    },
    {
      "epoch": 0.30227939090135253,
      "grad_norm": 1.1188259399602403,
      "learning_rate": 7.99e-06,
      "loss": 0.8468,
      "step": 799
    },
    {
      "epoch": 0.3026577130426558,
      "grad_norm": 1.1431484110606045,
      "learning_rate": 8e-06,
      "loss": 0.8401,
      "step": 800
    },
    {
      "epoch": 0.30303603518395916,
      "grad_norm": 1.083646592987573,
      "learning_rate": 7.999999611606006e-06,
      "loss": 0.8062,
      "step": 801
    },
    {
      "epoch": 0.30341435732526245,
      "grad_norm": 1.1319556143394125,
      "learning_rate": 7.999998446424103e-06,
      "loss": 0.8818,
      "step": 802
    },
    {
      "epoch": 0.3037926794665658,
      "grad_norm": 1.0994025822887656,
      "learning_rate": 7.999996504454512e-06,
      "loss": 0.8509,
      "step": 803
    },
    {
      "epoch": 0.3041710016078691,
      "grad_norm": 1.0755886346693961,
      "learning_rate": 7.999993785697617e-06,
      "loss": 0.8004,
      "step": 804
    },
    {
      "epoch": 0.30454932374917243,
      "grad_norm": 1.1441919264010905,
      "learning_rate": 7.99999029015394e-06,
      "loss": 0.808,
      "step": 805
    },
    {
      "epoch": 0.3049276458904757,
      "grad_norm": 1.1065610412104439,
      "learning_rate": 7.999986017824165e-06,
      "loss": 0.8549,
      "step": 806
    },
    {
      "epoch": 0.30530596803177906,
      "grad_norm": 1.0882701082696518,
      "learning_rate": 7.999980968709117e-06,
      "loss": 0.8468,
      "step": 807
    },
    {
      "epoch": 0.3056842901730824,
      "grad_norm": 1.1088124295992208,
      "learning_rate": 7.999975142809778e-06,
      "loss": 0.8736,
      "step": 808
    },
    {
      "epoch": 0.3060626123143857,
      "grad_norm": 1.1033663016693673,
      "learning_rate": 7.99996854012728e-06,
      "loss": 0.8476,
      "step": 809
    },
    {
      "epoch": 0.30644093445568904,
      "grad_norm": 1.13603689058083,
      "learning_rate": 7.999961160662905e-06,
      "loss": 0.8445,
      "step": 810
    },
    {
      "epoch": 0.30681925659699233,
      "grad_norm": 1.160741078547518,
      "learning_rate": 7.999953004418086e-06,
      "loss": 0.8858,
      "step": 811
    },
    {
      "epoch": 0.3071975787382957,
      "grad_norm": 1.1137885301105297,
      "learning_rate": 7.999944071394408e-06,
      "loss": 0.8468,
      "step": 812
    },
    {
      "epoch": 0.30757590087959896,
      "grad_norm": 1.0950922126362728,
      "learning_rate": 7.999934361593606e-06,
      "loss": 0.8277,
      "step": 813
    },
    {
      "epoch": 0.3079542230209023,
      "grad_norm": 1.0705498486629084,
      "learning_rate": 7.999923875017561e-06,
      "loss": 0.8542,
      "step": 814
    },
    {
      "epoch": 0.3083325451622056,
      "grad_norm": 1.0320443969916053,
      "learning_rate": 7.999912611668314e-06,
      "loss": 0.8311,
      "step": 815
    },
    {
      "epoch": 0.30871086730350894,
      "grad_norm": 1.1098560201406311,
      "learning_rate": 7.999900571548054e-06,
      "loss": 0.8285,
      "step": 816
    },
    {
      "epoch": 0.3090891894448123,
      "grad_norm": 1.117956788545042,
      "learning_rate": 7.999887754659112e-06,
      "loss": 0.8062,
      "step": 817
    },
    {
      "epoch": 0.3094675115861156,
      "grad_norm": 1.0815055115388574,
      "learning_rate": 7.999874161003984e-06,
      "loss": 0.825,
      "step": 818
    },
    {
      "epoch": 0.3098458337274189,
      "grad_norm": 1.1258610055051623,
      "learning_rate": 7.999859790585307e-06,
      "loss": 0.8544,
      "step": 819
    },
    {
      "epoch": 0.3102241558687222,
      "grad_norm": 1.0792203366803435,
      "learning_rate": 7.99984464340587e-06,
      "loss": 0.8371,
      "step": 820
    },
    {
      "epoch": 0.31060247801002555,
      "grad_norm": 1.0857066217255478,
      "learning_rate": 7.999828719468619e-06,
      "loss": 0.8025,
      "step": 821
    },
    {
      "epoch": 0.31098080015132884,
      "grad_norm": 1.0345681012946357,
      "learning_rate": 7.999812018776642e-06,
      "loss": 0.7961,
      "step": 822
    },
    {
      "epoch": 0.3113591222926322,
      "grad_norm": 1.0880871394519303,
      "learning_rate": 7.999794541333184e-06,
      "loss": 0.867,
      "step": 823
    },
    {
      "epoch": 0.3117374444339355,
      "grad_norm": 1.0734362647252,
      "learning_rate": 7.99977628714164e-06,
      "loss": 0.8504,
      "step": 824
    },
    {
      "epoch": 0.3121157665752388,
      "grad_norm": 1.0651195855212972,
      "learning_rate": 7.999757256205554e-06,
      "loss": 0.836,
      "step": 825
    },
    {
      "epoch": 0.31249408871654216,
      "grad_norm": 1.0952088927990486,
      "learning_rate": 7.99973744852862e-06,
      "loss": 0.8685,
      "step": 826
    },
    {
      "epoch": 0.31287241085784545,
      "grad_norm": 1.1189908995835645,
      "learning_rate": 7.999716864114687e-06,
      "loss": 0.8612,
      "step": 827
    },
    {
      "epoch": 0.3132507329991488,
      "grad_norm": 1.1107627441762915,
      "learning_rate": 7.999695502967753e-06,
      "loss": 0.887,
      "step": 828
    },
    {
      "epoch": 0.3136290551404521,
      "grad_norm": 1.0910830318775155,
      "learning_rate": 7.999673365091965e-06,
      "loss": 0.8149,
      "step": 829
    },
    {
      "epoch": 0.31400737728175543,
      "grad_norm": 1.0878738960197105,
      "learning_rate": 7.99965045049162e-06,
      "loss": 0.8543,
      "step": 830
    },
    {
      "epoch": 0.3143856994230587,
      "grad_norm": 1.1304840925957875,
      "learning_rate": 7.999626759171173e-06,
      "loss": 0.8607,
      "step": 831
    },
    {
      "epoch": 0.31476402156436206,
      "grad_norm": 1.0977832972523356,
      "learning_rate": 7.99960229113522e-06,
      "loss": 0.8238,
      "step": 832
    },
    {
      "epoch": 0.31514234370566535,
      "grad_norm": 1.1056029713906521,
      "learning_rate": 7.999577046388514e-06,
      "loss": 0.8449,
      "step": 833
    },
    {
      "epoch": 0.3155206658469687,
      "grad_norm": 1.1263279045653014,
      "learning_rate": 7.999551024935959e-06,
      "loss": 0.8996,
      "step": 834
    },
    {
      "epoch": 0.31589898798827204,
      "grad_norm": 1.1023495304424114,
      "learning_rate": 7.999524226782608e-06,
      "loss": 0.8059,
      "step": 835
    },
    {
      "epoch": 0.31627731012957533,
      "grad_norm": 1.0710753056086557,
      "learning_rate": 7.999496651933662e-06,
      "loss": 0.8364,
      "step": 836
    },
    {
      "epoch": 0.3166556322708787,
      "grad_norm": 1.1628408036471776,
      "learning_rate": 7.999468300394481e-06,
      "loss": 0.8491,
      "step": 837
    },
    {
      "epoch": 0.31703395441218196,
      "grad_norm": 1.1011205956685801,
      "learning_rate": 7.999439172170566e-06,
      "loss": 0.8371,
      "step": 838
    },
    {
      "epoch": 0.3174122765534853,
      "grad_norm": 1.067716374321139,
      "learning_rate": 7.999409267267577e-06,
      "loss": 0.8257,
      "step": 839
    },
    {
      "epoch": 0.3177905986947886,
      "grad_norm": 1.1358374860128349,
      "learning_rate": 7.99937858569132e-06,
      "loss": 0.8317,
      "step": 840
    },
    {
      "epoch": 0.31816892083609194,
      "grad_norm": 1.0779959631518108,
      "learning_rate": 7.999347127447752e-06,
      "loss": 0.7981,
      "step": 841
    },
    {
      "epoch": 0.31854724297739523,
      "grad_norm": 1.1254796876535107,
      "learning_rate": 7.999314892542985e-06,
      "loss": 0.8971,
      "step": 842
    },
    {
      "epoch": 0.3189255651186986,
      "grad_norm": 1.0901729922813403,
      "learning_rate": 7.999281880983277e-06,
      "loss": 0.8506,
      "step": 843
    },
    {
      "epoch": 0.3193038872600019,
      "grad_norm": 1.0709160400913234,
      "learning_rate": 7.999248092775039e-06,
      "loss": 0.8468,
      "step": 844
    },
    {
      "epoch": 0.3196822094013052,
      "grad_norm": 1.1223182444160262,
      "learning_rate": 7.999213527924831e-06,
      "loss": 0.8217,
      "step": 845
    },
    {
      "epoch": 0.32006053154260855,
      "grad_norm": 1.1033066311400137,
      "learning_rate": 7.99917818643937e-06,
      "loss": 0.8646,
      "step": 846
    },
    {
      "epoch": 0.32043885368391184,
      "grad_norm": 1.1122943393613496,
      "learning_rate": 7.999142068325514e-06,
      "loss": 0.8343,
      "step": 847
    },
    {
      "epoch": 0.3208171758252152,
      "grad_norm": 1.1197740571480894,
      "learning_rate": 7.999105173590281e-06,
      "loss": 0.8408,
      "step": 848
    },
    {
      "epoch": 0.3211954979665185,
      "grad_norm": 1.0680302459683109,
      "learning_rate": 7.999067502240835e-06,
      "loss": 0.8527,
      "step": 849
    },
    {
      "epoch": 0.3215738201078218,
      "grad_norm": 1.0872491602723373,
      "learning_rate": 7.99902905428449e-06,
      "loss": 0.8417,
      "step": 850
    },
    {
      "epoch": 0.3219521422491251,
      "grad_norm": 1.106663351318103,
      "learning_rate": 7.998989829728712e-06,
      "loss": 0.8055,
      "step": 851
    },
    {
      "epoch": 0.32233046439042845,
      "grad_norm": 1.0809694317490106,
      "learning_rate": 7.998949828581122e-06,
      "loss": 0.8614,
      "step": 852
    },
    {
      "epoch": 0.3227087865317318,
      "grad_norm": 1.102190346138006,
      "learning_rate": 7.998909050849484e-06,
      "loss": 0.8716,
      "step": 853
    },
    {
      "epoch": 0.3230871086730351,
      "grad_norm": 1.0436133036323463,
      "learning_rate": 7.998867496541719e-06,
      "loss": 0.8575,
      "step": 854
    },
    {
      "epoch": 0.32346543081433843,
      "grad_norm": 1.0545933388006492,
      "learning_rate": 7.998825165665894e-06,
      "loss": 0.8208,
      "step": 855
    },
    {
      "epoch": 0.3238437529556417,
      "grad_norm": 1.066597036199654,
      "learning_rate": 7.998782058230237e-06,
      "loss": 0.7723,
      "step": 856
    },
    {
      "epoch": 0.32422207509694506,
      "grad_norm": 1.053365311188067,
      "learning_rate": 7.998738174243111e-06,
      "loss": 0.8102,
      "step": 857
    },
    {
      "epoch": 0.32460039723824835,
      "grad_norm": 1.0581107038361595,
      "learning_rate": 7.99869351371304e-06,
      "loss": 0.7999,
      "step": 858
    },
    {
      "epoch": 0.3249787193795517,
      "grad_norm": 1.1008953546338276,
      "learning_rate": 7.998648076648702e-06,
      "loss": 0.8568,
      "step": 859
    },
    {
      "epoch": 0.325357041520855,
      "grad_norm": 1.1417115474045594,
      "learning_rate": 7.998601863058915e-06,
      "loss": 0.8183,
      "step": 860
    },
    {
      "epoch": 0.32573536366215833,
      "grad_norm": 1.0221082409435902,
      "learning_rate": 7.998554872952656e-06,
      "loss": 0.8236,
      "step": 861
    },
    {
      "epoch": 0.3261136858034617,
      "grad_norm": 1.0319653291858766,
      "learning_rate": 7.99850710633905e-06,
      "loss": 0.8268,
      "step": 862
    },
    {
      "epoch": 0.32649200794476496,
      "grad_norm": 1.0741619232930077,
      "learning_rate": 7.998458563227374e-06,
      "loss": 0.8635,
      "step": 863
    },
    {
      "epoch": 0.3268703300860683,
      "grad_norm": 1.084988318258729,
      "learning_rate": 7.998409243627051e-06,
      "loss": 0.807,
      "step": 864
    },
    {
      "epoch": 0.3272486522273716,
      "grad_norm": 1.0687498037098355,
      "learning_rate": 7.998359147547665e-06,
      "loss": 0.852,
      "step": 865
    },
    {
      "epoch": 0.32762697436867494,
      "grad_norm": 1.125647258256957,
      "learning_rate": 7.99830827499894e-06,
      "loss": 0.8153,
      "step": 866
    },
    {
      "epoch": 0.32800529650997823,
      "grad_norm": 1.1182770611625017,
      "learning_rate": 7.998256625990756e-06,
      "loss": 0.8103,
      "step": 867
    },
    {
      "epoch": 0.3283836186512816,
      "grad_norm": 1.0564435912408205,
      "learning_rate": 7.998204200533144e-06,
      "loss": 0.8119,
      "step": 868
    },
    {
      "epoch": 0.32876194079258486,
      "grad_norm": 1.1460131223742922,
      "learning_rate": 7.998150998636284e-06,
      "loss": 0.8289,
      "step": 869
    },
    {
      "epoch": 0.3291402629338882,
      "grad_norm": 1.0575674306051868,
      "learning_rate": 7.998097020310509e-06,
      "loss": 0.8428,
      "step": 870
    },
    {
      "epoch": 0.32951858507519155,
      "grad_norm": 1.1137833102998567,
      "learning_rate": 7.9980422655663e-06,
      "loss": 0.8218,
      "step": 871
    },
    {
      "epoch": 0.32989690721649484,
      "grad_norm": 1.1107427833797017,
      "learning_rate": 7.997986734414291e-06,
      "loss": 0.851,
      "step": 872
    },
    {
      "epoch": 0.3302752293577982,
      "grad_norm": 1.1272405856822123,
      "learning_rate": 7.997930426865266e-06,
      "loss": 0.8604,
      "step": 873
    },
    {
      "epoch": 0.3306535514991015,
      "grad_norm": 1.0539626107226423,
      "learning_rate": 7.997873342930158e-06,
      "loss": 0.8531,
      "step": 874
    },
    {
      "epoch": 0.3310318736404048,
      "grad_norm": 1.0696538969484604,
      "learning_rate": 7.997815482620057e-06,
      "loss": 0.838,
      "step": 875
    },
    {
      "epoch": 0.3314101957817081,
      "grad_norm": 1.1460143163401961,
      "learning_rate": 7.997756845946193e-06,
      "loss": 0.7944,
      "step": 876
    },
    {
      "epoch": 0.33178851792301145,
      "grad_norm": 1.1082280014219137,
      "learning_rate": 7.997697432919957e-06,
      "loss": 0.9019,
      "step": 877
    },
    {
      "epoch": 0.33216684006431474,
      "grad_norm": 1.0841358926479614,
      "learning_rate": 7.997637243552888e-06,
      "loss": 0.7975,
      "step": 878
    },
    {
      "epoch": 0.3325451622056181,
      "grad_norm": 1.056009898365743,
      "learning_rate": 7.997576277856674e-06,
      "loss": 0.8574,
      "step": 879
    },
    {
      "epoch": 0.33292348434692143,
      "grad_norm": 1.0802951235255627,
      "learning_rate": 7.99751453584315e-06,
      "loss": 0.8155,
      "step": 880
    },
    {
      "epoch": 0.3333018064882247,
      "grad_norm": 1.077889148763545,
      "learning_rate": 7.99745201752431e-06,
      "loss": 0.7963,
      "step": 881
    },
    {
      "epoch": 0.33368012862952806,
      "grad_norm": 1.1621065299950686,
      "learning_rate": 7.997388722912295e-06,
      "loss": 0.8548,
      "step": 882
    },
    {
      "epoch": 0.33405845077083135,
      "grad_norm": 1.1322105218350456,
      "learning_rate": 7.997324652019394e-06,
      "loss": 0.8795,
      "step": 883
    },
    {
      "epoch": 0.3344367729121347,
      "grad_norm": 1.136478913491314,
      "learning_rate": 7.997259804858054e-06,
      "loss": 0.8053,
      "step": 884
    },
    {
      "epoch": 0.334815095053438,
      "grad_norm": 1.132941842896281,
      "learning_rate": 7.997194181440863e-06,
      "loss": 0.8753,
      "step": 885
    },
    {
      "epoch": 0.3351934171947413,
      "grad_norm": 1.072088751980564,
      "learning_rate": 7.997127781780567e-06,
      "loss": 0.8471,
      "step": 886
    },
    {
      "epoch": 0.3355717393360446,
      "grad_norm": 1.136959198020949,
      "learning_rate": 7.997060605890062e-06,
      "loss": 0.8805,
      "step": 887
    },
    {
      "epoch": 0.33595006147734796,
      "grad_norm": 1.1411444801682626,
      "learning_rate": 7.996992653782392e-06,
      "loss": 0.8241,
      "step": 888
    },
    {
      "epoch": 0.3363283836186513,
      "grad_norm": 1.0911333474121823,
      "learning_rate": 7.996923925470752e-06,
      "loss": 0.8134,
      "step": 889
    },
    {
      "epoch": 0.3367067057599546,
      "grad_norm": 1.0929540349841498,
      "learning_rate": 7.996854420968492e-06,
      "loss": 0.8362,
      "step": 890
    },
    {
      "epoch": 0.33708502790125794,
      "grad_norm": 1.1142134518728692,
      "learning_rate": 7.996784140289106e-06,
      "loss": 0.8583,
      "step": 891
    },
    {
      "epoch": 0.3374633500425612,
      "grad_norm": 1.0776120467255657,
      "learning_rate": 7.996713083446245e-06,
      "loss": 0.8405,
      "step": 892
    },
    {
      "epoch": 0.33784167218386457,
      "grad_norm": 1.0315550349351374,
      "learning_rate": 7.996641250453707e-06,
      "loss": 0.8233,
      "step": 893
    },
    {
      "epoch": 0.33821999432516786,
      "grad_norm": 1.1320956870150307,
      "learning_rate": 7.996568641325441e-06,
      "loss": 0.8497,
      "step": 894
    },
    {
      "epoch": 0.3385983164664712,
      "grad_norm": 1.0891148355471727,
      "learning_rate": 7.996495256075548e-06,
      "loss": 0.8338,
      "step": 895
    },
    {
      "epoch": 0.3389766386077745,
      "grad_norm": 1.1104610577848222,
      "learning_rate": 7.99642109471828e-06,
      "loss": 0.8166,
      "step": 896
    },
    {
      "epoch": 0.33935496074907784,
      "grad_norm": 1.0961276245110951,
      "learning_rate": 7.996346157268037e-06,
      "loss": 0.8213,
      "step": 897
    },
    {
      "epoch": 0.3397332828903812,
      "grad_norm": 1.053397674073016,
      "learning_rate": 7.996270443739375e-06,
      "loss": 0.8269,
      "step": 898
    },
    {
      "epoch": 0.34011160503168447,
      "grad_norm": 1.05985869383675,
      "learning_rate": 7.996193954146995e-06,
      "loss": 0.8632,
      "step": 899
    },
    {
      "epoch": 0.3404899271729878,
      "grad_norm": 1.0747332831609127,
      "learning_rate": 7.996116688505749e-06,
      "loss": 0.8308,
      "step": 900
    },
    {
      "epoch": 0.3408682493142911,
      "grad_norm": 1.0617958908539586,
      "learning_rate": 7.996038646830645e-06,
      "loss": 0.8003,
      "step": 901
    },
    {
      "epoch": 0.34124657145559445,
      "grad_norm": 1.0595674189471762,
      "learning_rate": 7.995959829136837e-06,
      "loss": 0.7948,
      "step": 902
    },
    {
      "epoch": 0.34162489359689774,
      "grad_norm": 1.0753382871745762,
      "learning_rate": 7.995880235439632e-06,
      "loss": 0.8399,
      "step": 903
    },
    {
      "epoch": 0.3420032157382011,
      "grad_norm": 1.1183441140434693,
      "learning_rate": 7.995799865754487e-06,
      "loss": 0.8221,
      "step": 904
    },
    {
      "epoch": 0.34238153787950437,
      "grad_norm": 1.0929766123596374,
      "learning_rate": 7.995718720097011e-06,
      "loss": 0.8309,
      "step": 905
    },
    {
      "epoch": 0.3427598600208077,
      "grad_norm": 1.0179073548109145,
      "learning_rate": 7.995636798482959e-06,
      "loss": 0.8355,
      "step": 906
    },
    {
      "epoch": 0.34313818216211106,
      "grad_norm": 1.1183732645745317,
      "learning_rate": 7.99555410092824e-06,
      "loss": 0.8376,
      "step": 907
    },
    {
      "epoch": 0.34351650430341435,
      "grad_norm": 1.165733705514543,
      "learning_rate": 7.995470627448915e-06,
      "loss": 0.86,
      "step": 908
    },
    {
      "epoch": 0.3438948264447177,
      "grad_norm": 1.0552618018743587,
      "learning_rate": 7.995386378061196e-06,
      "loss": 0.8468,
      "step": 909
    },
    {
      "epoch": 0.344273148586021,
      "grad_norm": 1.131651010498469,
      "learning_rate": 7.995301352781439e-06,
      "loss": 0.8489,
      "step": 910
    },
    {
      "epoch": 0.3446514707273243,
      "grad_norm": 1.1028826199732988,
      "learning_rate": 7.995215551626162e-06,
      "loss": 0.8721,
      "step": 911
    },
    {
      "epoch": 0.3450297928686276,
      "grad_norm": 1.1380255943103783,
      "learning_rate": 7.995128974612022e-06,
      "loss": 0.8484,
      "step": 912
    },
    {
      "epoch": 0.34540811500993096,
      "grad_norm": 1.0659393620350812,
      "learning_rate": 7.995041621755835e-06,
      "loss": 0.8198,
      "step": 913
    },
    {
      "epoch": 0.34578643715123425,
      "grad_norm": 1.059819166817385,
      "learning_rate": 7.994953493074562e-06,
      "loss": 0.8601,
      "step": 914
    },
    {
      "epoch": 0.3461647592925376,
      "grad_norm": 1.1168724106612267,
      "learning_rate": 7.994864588585323e-06,
      "loss": 0.8314,
      "step": 915
    },
    {
      "epoch": 0.34654308143384094,
      "grad_norm": 1.0696755810222651,
      "learning_rate": 7.994774908305377e-06,
      "loss": 0.8488,
      "step": 916
    },
    {
      "epoch": 0.3469214035751442,
      "grad_norm": 1.1571812110459856,
      "learning_rate": 7.99468445225214e-06,
      "loss": 0.8157,
      "step": 917
    },
    {
      "epoch": 0.34729972571644757,
      "grad_norm": 1.114611745775756,
      "learning_rate": 7.994593220443181e-06,
      "loss": 0.8368,
      "step": 918
    },
    {
      "epoch": 0.34767804785775086,
      "grad_norm": 1.152864146273239,
      "learning_rate": 7.994501212896218e-06,
      "loss": 0.861,
      "step": 919
    },
    {
      "epoch": 0.3480563699990542,
      "grad_norm": 1.1345158690879138,
      "learning_rate": 7.994408429629113e-06,
      "loss": 0.8163,
      "step": 920
    },
    {
      "epoch": 0.3484346921403575,
      "grad_norm": 1.0577940861565938,
      "learning_rate": 7.994314870659892e-06,
      "loss": 0.7803,
      "step": 921
    },
    {
      "epoch": 0.34881301428166084,
      "grad_norm": 1.04106331488491,
      "learning_rate": 7.994220536006717e-06,
      "loss": 0.8291,
      "step": 922
    },
    {
      "epoch": 0.3491913364229641,
      "grad_norm": 1.0394935151014175,
      "learning_rate": 7.99412542568791e-06,
      "loss": 0.7819,
      "step": 923
    },
    {
      "epoch": 0.34956965856426747,
      "grad_norm": 1.1306507694533081,
      "learning_rate": 7.994029539721941e-06,
      "loss": 0.8594,
      "step": 924
    },
    {
      "epoch": 0.3499479807055708,
      "grad_norm": 1.0984697906601044,
      "learning_rate": 7.993932878127433e-06,
      "loss": 0.872,
      "step": 925
    },
    {
      "epoch": 0.3503263028468741,
      "grad_norm": 1.0848529154386723,
      "learning_rate": 7.993835440923154e-06,
      "loss": 0.8668,
      "step": 926
    },
    {
      "epoch": 0.35070462498817745,
      "grad_norm": 1.074249076888769,
      "learning_rate": 7.993737228128028e-06,
      "loss": 0.88,
      "step": 927
    },
    {
      "epoch": 0.35108294712948074,
      "grad_norm": 1.0595559434730502,
      "learning_rate": 7.993638239761127e-06,
      "loss": 0.8448,
      "step": 928
    },
    {
      "epoch": 0.3514612692707841,
      "grad_norm": 1.0586225742216135,
      "learning_rate": 7.993538475841674e-06,
      "loss": 0.806,
      "step": 929
    },
    {
      "epoch": 0.35183959141208737,
      "grad_norm": 1.0965639423993851,
      "learning_rate": 7.993437936389045e-06,
      "loss": 0.8532,
      "step": 930
    },
    {
      "epoch": 0.3522179135533907,
      "grad_norm": 1.0635648509605742,
      "learning_rate": 7.99333662142276e-06,
      "loss": 0.8659,
      "step": 931
    },
    {
      "epoch": 0.3522179135533907,
      "eval_loss": 0.8405433893203735,
      "eval_runtime": 26.7827,
      "eval_samples_per_second": 33.044,
      "eval_steps_per_second": 1.045,
      "step": 931
    },
    {
      "epoch": 0.3522179135533907,
      "eval_bench_accuracy_arc_challenge": 0.2,
      "eval_bench_accuracy_hellaswag": 0.265,
      "eval_bench_accuracy_mmlu": 0.20869565217391303,
      "eval_bench_average_accuracy": 0.22456521739130433,
      "eval_bench_loss": 4.116911503306606,
      "eval_bench_total_accuracy": 0.23076923076923078,
      "step": 931
    },
    {
      "epoch": 0.352596235694694,
      "grad_norm": 1.071445968085627,
      "learning_rate": 7.993234530962498e-06,
      "loss": 0.8349,
      "step": 932
    },
    {
      "epoch": 0.35297455783599735,
      "grad_norm": 1.1138872222419933,
      "learning_rate": 7.993131665028082e-06,
      "loss": 0.8369,
      "step": 933
    },
    {
      "epoch": 0.3533528799773007,
      "grad_norm": 1.034081458809988,
      "learning_rate": 7.993028023639493e-06,
      "loss": 0.8302,
      "step": 934
    },
    {
      "epoch": 0.353731202118604,
      "grad_norm": 1.0615568247982479,
      "learning_rate": 7.992923606816852e-06,
      "loss": 0.7956,
      "step": 935
    },
    {
      "epoch": 0.3541095242599073,
      "grad_norm": 1.0966324306911683,
      "learning_rate": 7.992818414580439e-06,
      "loss": 0.8157,
      "step": 936
    },
    {
      "epoch": 0.3544878464012106,
      "grad_norm": 1.0499428116789347,
      "learning_rate": 7.992712446950682e-06,
      "loss": 0.8448,
      "step": 937
    },
    {
      "epoch": 0.35486616854251396,
      "grad_norm": 1.0929166781794446,
      "learning_rate": 7.99260570394816e-06,
      "loss": 0.838,
      "step": 938
    },
    {
      "epoch": 0.35524449068381725,
      "grad_norm": 1.0784478665113866,
      "learning_rate": 7.9924981855936e-06,
      "loss": 0.8477,
      "step": 939
    },
    {
      "epoch": 0.3556228128251206,
      "grad_norm": 1.112873701673093,
      "learning_rate": 7.992389891907885e-06,
      "loss": 0.837,
      "step": 940
    },
    {
      "epoch": 0.3560011349664239,
      "grad_norm": 1.0396578216523251,
      "learning_rate": 7.992280822912044e-06,
      "loss": 0.7867,
      "step": 941
    },
    {
      "epoch": 0.3563794571077272,
      "grad_norm": 1.1025438788531285,
      "learning_rate": 7.992170978627258e-06,
      "loss": 0.8588,
      "step": 942
    },
    {
      "epoch": 0.35675777924903057,
      "grad_norm": 1.0567533995232752,
      "learning_rate": 7.992060359074857e-06,
      "loss": 0.8415,
      "step": 943
    },
    {
      "epoch": 0.35713610139033386,
      "grad_norm": 1.0876544163342308,
      "learning_rate": 7.991948964276324e-06,
      "loss": 0.8139,
      "step": 944
    },
    {
      "epoch": 0.3575144235316372,
      "grad_norm": 1.1119965568409491,
      "learning_rate": 7.991836794253291e-06,
      "loss": 0.8236,
      "step": 945
    },
    {
      "epoch": 0.3578927456729405,
      "grad_norm": 1.050449035576396,
      "learning_rate": 7.991723849027543e-06,
      "loss": 0.8683,
      "step": 946
    },
    {
      "epoch": 0.35827106781424384,
      "grad_norm": 1.0727809938491701,
      "learning_rate": 7.991610128621012e-06,
      "loss": 0.8637,
      "step": 947
    },
    {
      "epoch": 0.3586493899555471,
      "grad_norm": 1.1142250081446294,
      "learning_rate": 7.991495633055782e-06,
      "loss": 0.8173,
      "step": 948
    },
    {
      "epoch": 0.35902771209685047,
      "grad_norm": 1.0422992081938323,
      "learning_rate": 7.99138036235409e-06,
      "loss": 0.8247,
      "step": 949
    },
    {
      "epoch": 0.3594060342381538,
      "grad_norm": 1.0683985452632145,
      "learning_rate": 7.991264316538315e-06,
      "loss": 0.7835,
      "step": 950
    },
    {
      "epoch": 0.3597843563794571,
      "grad_norm": 1.1389275468673155,
      "learning_rate": 7.991147495631001e-06,
      "loss": 0.8263,
      "step": 951
    },
    {
      "epoch": 0.36016267852076045,
      "grad_norm": 1.0300732494637694,
      "learning_rate": 7.99102989965483e-06,
      "loss": 0.8382,
      "step": 952
    },
    {
      "epoch": 0.36054100066206374,
      "grad_norm": 1.1134877059951171,
      "learning_rate": 7.990911528632637e-06,
      "loss": 0.8301,
      "step": 953
    },
    {
      "epoch": 0.3609193228033671,
      "grad_norm": 1.1556214956120872,
      "learning_rate": 7.990792382587413e-06,
      "loss": 0.8339,
      "step": 954
    },
    {
      "epoch": 0.36129764494467037,
      "grad_norm": 1.0496596260111375,
      "learning_rate": 7.990672461542295e-06,
      "loss": 0.855,
      "step": 955
    },
    {
      "epoch": 0.3616759670859737,
      "grad_norm": 1.0631933354628074,
      "learning_rate": 7.99055176552057e-06,
      "loss": 0.8028,
      "step": 956
    },
    {
      "epoch": 0.362054289227277,
      "grad_norm": 1.112630845203049,
      "learning_rate": 7.990430294545676e-06,
      "loss": 0.8324,
      "step": 957
    },
    {
      "epoch": 0.36243261136858035,
      "grad_norm": 1.047199242259213,
      "learning_rate": 7.990308048641205e-06,
      "loss": 0.8113,
      "step": 958
    },
    {
      "epoch": 0.3628109335098837,
      "grad_norm": 1.027441822648717,
      "learning_rate": 7.990185027830895e-06,
      "loss": 0.818,
      "step": 959
    },
    {
      "epoch": 0.363189255651187,
      "grad_norm": 1.1215384265121908,
      "learning_rate": 7.990061232138636e-06,
      "loss": 0.8105,
      "step": 960
    },
    {
      "epoch": 0.3635675777924903,
      "grad_norm": 1.068442952320319,
      "learning_rate": 7.989936661588471e-06,
      "loss": 0.7921,
      "step": 961
    },
    {
      "epoch": 0.3639458999337936,
      "grad_norm": 1.1092839541563482,
      "learning_rate": 7.989811316204588e-06,
      "loss": 0.8604,
      "step": 962
    },
    {
      "epoch": 0.36432422207509696,
      "grad_norm": 1.071801311807864,
      "learning_rate": 7.989685196011332e-06,
      "loss": 0.8309,
      "step": 963
    },
    {
      "epoch": 0.36470254421640025,
      "grad_norm": 1.0755045364863953,
      "learning_rate": 7.989558301033193e-06,
      "loss": 0.8281,
      "step": 964
    },
    {
      "epoch": 0.3650808663577036,
      "grad_norm": 1.0267320983799983,
      "learning_rate": 7.989430631294813e-06,
      "loss": 0.8354,
      "step": 965
    },
    {
      "epoch": 0.3654591884990069,
      "grad_norm": 1.137253491825624,
      "learning_rate": 7.98930218682099e-06,
      "loss": 0.879,
      "step": 966
    },
    {
      "epoch": 0.3658375106403102,
      "grad_norm": 1.078336142946193,
      "learning_rate": 7.989172967636661e-06,
      "loss": 0.7937,
      "step": 967
    },
    {
      "epoch": 0.36621583278161357,
      "grad_norm": 1.249220122221408,
      "learning_rate": 7.98904297376692e-06,
      "loss": 0.8719,
      "step": 968
    },
    {
      "epoch": 0.36659415492291686,
      "grad_norm": 1.0553052489470098,
      "learning_rate": 7.988912205237018e-06,
      "loss": 0.8343,
      "step": 969
    },
    {
      "epoch": 0.3669724770642202,
      "grad_norm": 1.0825650361601242,
      "learning_rate": 7.988780662072345e-06,
      "loss": 0.8708,
      "step": 970
    },
    {
      "epoch": 0.3673507992055235,
      "grad_norm": 1.0492113257783737,
      "learning_rate": 7.988648344298449e-06,
      "loss": 0.8158,
      "step": 971
    },
    {
      "epoch": 0.36772912134682684,
      "grad_norm": 1.1098170719484017,
      "learning_rate": 7.988515251941022e-06,
      "loss": 0.8072,
      "step": 972
    },
    {
      "epoch": 0.3681074434881301,
      "grad_norm": 1.0470408388006793,
      "learning_rate": 7.988381385025913e-06,
      "loss": 0.8254,
      "step": 973
    },
    {
      "epoch": 0.36848576562943347,
      "grad_norm": 1.1223023650314936,
      "learning_rate": 7.988246743579118e-06,
      "loss": 0.8422,
      "step": 974
    },
    {
      "epoch": 0.36886408777073676,
      "grad_norm": 1.0378189816707217,
      "learning_rate": 7.988111327626781e-06,
      "loss": 0.7986,
      "step": 975
    },
    {
      "epoch": 0.3692424099120401,
      "grad_norm": 1.0879026599404655,
      "learning_rate": 7.987975137195206e-06,
      "loss": 0.8239,
      "step": 976
    },
    {
      "epoch": 0.36962073205334345,
      "grad_norm": 1.0445944467404071,
      "learning_rate": 7.987838172310836e-06,
      "loss": 0.7856,
      "step": 977
    },
    {
      "epoch": 0.36999905419464674,
      "grad_norm": 1.0952504464513027,
      "learning_rate": 7.987700433000268e-06,
      "loss": 0.8474,
      "step": 978
    },
    {
      "epoch": 0.3703773763359501,
      "grad_norm": 1.0976482765823483,
      "learning_rate": 7.987561919290254e-06,
      "loss": 0.8067,
      "step": 979
    },
    {
      "epoch": 0.37075569847725337,
      "grad_norm": 1.0673215016151512,
      "learning_rate": 7.987422631207691e-06,
      "loss": 0.7747,
      "step": 980
    },
    {
      "epoch": 0.3711340206185567,
      "grad_norm": 1.1205110055136513,
      "learning_rate": 7.98728256877963e-06,
      "loss": 0.7892,
      "step": 981
    },
    {
      "epoch": 0.37151234275986,
      "grad_norm": 1.092436787430483,
      "learning_rate": 7.987141732033268e-06,
      "loss": 0.8332,
      "step": 982
    },
    {
      "epoch": 0.37189066490116335,
      "grad_norm": 1.091564370951629,
      "learning_rate": 7.987000120995958e-06,
      "loss": 0.8318,
      "step": 983
    },
    {
      "epoch": 0.37226898704246664,
      "grad_norm": 1.0840271784135682,
      "learning_rate": 7.986857735695197e-06,
      "loss": 0.8343,
      "step": 984
    },
    {
      "epoch": 0.37264730918377,
      "grad_norm": 1.1224128911012572,
      "learning_rate": 7.98671457615864e-06,
      "loss": 0.8084,
      "step": 985
    },
    {
      "epoch": 0.3730256313250733,
      "grad_norm": 1.0744788507306402,
      "learning_rate": 7.986570642414086e-06,
      "loss": 0.8468,
      "step": 986
    },
    {
      "epoch": 0.3734039534663766,
      "grad_norm": 1.0627524449061605,
      "learning_rate": 7.986425934489486e-06,
      "loss": 0.794,
      "step": 987
    },
    {
      "epoch": 0.37378227560767996,
      "grad_norm": 1.1606049685680029,
      "learning_rate": 7.986280452412942e-06,
      "loss": 0.8599,
      "step": 988
    },
    {
      "epoch": 0.37416059774898325,
      "grad_norm": 1.1453346028219251,
      "learning_rate": 7.986134196212707e-06,
      "loss": 0.839,
      "step": 989
    },
    {
      "epoch": 0.3745389198902866,
      "grad_norm": 1.047560845313498,
      "learning_rate": 7.985987165917182e-06,
      "loss": 0.838,
      "step": 990
    },
    {
      "epoch": 0.3749172420315899,
      "grad_norm": 1.0691648190671164,
      "learning_rate": 7.985839361554922e-06,
      "loss": 0.8349,
      "step": 991
    },
    {
      "epoch": 0.3752955641728932,
      "grad_norm": 1.0728147519090105,
      "learning_rate": 7.985690783154628e-06,
      "loss": 0.8082,
      "step": 992
    },
    {
      "epoch": 0.3756738863141965,
      "grad_norm": 1.0710609346244502,
      "learning_rate": 7.985541430745155e-06,
      "loss": 0.8367,
      "step": 993
    },
    {
      "epoch": 0.37605220845549986,
      "grad_norm": 1.0345097180466358,
      "learning_rate": 7.985391304355508e-06,
      "loss": 0.8235,
      "step": 994
    },
    {
      "epoch": 0.3764305305968032,
      "grad_norm": 1.0627329252549442,
      "learning_rate": 7.985240404014836e-06,
      "loss": 0.8361,
      "step": 995
    },
    {
      "epoch": 0.3768088527381065,
      "grad_norm": 1.055170154515539,
      "learning_rate": 7.98508872975245e-06,
      "loss": 0.7913,
      "step": 996
    },
    {
      "epoch": 0.37718717487940984,
      "grad_norm": 1.0799095201174227,
      "learning_rate": 7.9849362815978e-06,
      "loss": 0.8143,
      "step": 997
    },
    {
      "epoch": 0.3775654970207131,
      "grad_norm": 1.1004168575034028,
      "learning_rate": 7.984783059580493e-06,
      "loss": 0.8325,
      "step": 998
    },
    {
      "epoch": 0.37794381916201647,
      "grad_norm": 1.064297565177233,
      "learning_rate": 7.984629063730284e-06,
      "loss": 0.7825,
      "step": 999
    },
    {
      "epoch": 0.37832214130331976,
      "grad_norm": 1.0635329039354893,
      "learning_rate": 7.984474294077078e-06,
      "loss": 0.843,
      "step": 1000
    },
    {
      "epoch": 0.3787004634446231,
      "grad_norm": 1.0134149947950788,
      "learning_rate": 7.98431875065093e-06,
      "loss": 0.8407,
      "step": 1001
    },
    {
      "epoch": 0.3790787855859264,
      "grad_norm": 1.1003240739229772,
      "learning_rate": 7.984162433482048e-06,
      "loss": 0.8757,
      "step": 1002
    },
    {
      "epoch": 0.37945710772722974,
      "grad_norm": 1.0704123729576063,
      "learning_rate": 7.984005342600789e-06,
      "loss": 0.8385,
      "step": 1003
    },
    {
      "epoch": 0.3798354298685331,
      "grad_norm": 1.082489049237877,
      "learning_rate": 7.983847478037655e-06,
      "loss": 0.8494,
      "step": 1004
    },
    {
      "epoch": 0.38021375200983637,
      "grad_norm": 1.080752264367249,
      "learning_rate": 7.983688839823308e-06,
      "loss": 0.8609,
      "step": 1005
    },
    {
      "epoch": 0.3805920741511397,
      "grad_norm": 1.1968418204384677,
      "learning_rate": 7.983529427988552e-06,
      "loss": 0.8564,
      "step": 1006
    },
    {
      "epoch": 0.380970396292443,
      "grad_norm": 1.061469890379153,
      "learning_rate": 7.983369242564346e-06,
      "loss": 0.7891,
      "step": 1007
    },
    {
      "epoch": 0.38134871843374635,
      "grad_norm": 1.0621745023983624,
      "learning_rate": 7.983208283581796e-06,
      "loss": 0.864,
      "step": 1008
    },
    {
      "epoch": 0.38172704057504964,
      "grad_norm": 1.1002758271639341,
      "learning_rate": 7.98304655107216e-06,
      "loss": 0.8511,
      "step": 1009
    },
    {
      "epoch": 0.382105362716353,
      "grad_norm": 1.2982365803931801,
      "learning_rate": 7.982884045066848e-06,
      "loss": 0.8707,
      "step": 1010
    },
    {
      "epoch": 0.38248368485765627,
      "grad_norm": 1.0481998500890215,
      "learning_rate": 7.982720765597416e-06,
      "loss": 0.808,
      "step": 1011
    },
    {
      "epoch": 0.3828620069989596,
      "grad_norm": 1.0843657280284922,
      "learning_rate": 7.982556712695573e-06,
      "loss": 0.8033,
      "step": 1012
    },
    {
      "epoch": 0.38324032914026296,
      "grad_norm": 1.056797859890995,
      "learning_rate": 7.982391886393176e-06,
      "loss": 0.8109,
      "step": 1013
    },
    {
      "epoch": 0.38361865128156625,
      "grad_norm": 1.060307047043872,
      "learning_rate": 7.982226286722239e-06,
      "loss": 0.8485,
      "step": 1014
    },
    {
      "epoch": 0.3839969734228696,
      "grad_norm": 1.0880414860647125,
      "learning_rate": 7.982059913714915e-06,
      "loss": 0.829,
      "step": 1015
    },
    {
      "epoch": 0.3843752955641729,
      "grad_norm": 1.0647653565219015,
      "learning_rate": 7.981892767403516e-06,
      "loss": 0.831,
      "step": 1016
    },
    {
      "epoch": 0.3847536177054762,
      "grad_norm": 1.1245340497823308,
      "learning_rate": 7.9817248478205e-06,
      "loss": 0.8633,
      "step": 1017
    },
    {
      "epoch": 0.3851319398467795,
      "grad_norm": 1.083643967559738,
      "learning_rate": 7.981556154998477e-06,
      "loss": 0.8694,
      "step": 1018
    },
    {
      "epoch": 0.38551026198808286,
      "grad_norm": 1.0892685401414424,
      "learning_rate": 7.981386688970209e-06,
      "loss": 0.8455,
      "step": 1019
    },
    {
      "epoch": 0.38588858412938615,
      "grad_norm": 1.080573813534876,
      "learning_rate": 7.981216449768603e-06,
      "loss": 0.8028,
      "step": 1020
    },
    {
      "epoch": 0.3862669062706895,
      "grad_norm": 1.0697257333484091,
      "learning_rate": 7.981045437426718e-06,
      "loss": 0.8254,
      "step": 1021
    },
    {
      "epoch": 0.38664522841199284,
      "grad_norm": 1.1482898982014345,
      "learning_rate": 7.980873651977768e-06,
      "loss": 0.8434,
      "step": 1022
    },
    {
      "epoch": 0.3870235505532961,
      "grad_norm": 1.066295131291774,
      "learning_rate": 7.98070109345511e-06,
      "loss": 0.7966,
      "step": 1023
    },
    {
      "epoch": 0.38740187269459947,
      "grad_norm": 1.0329631074824188,
      "learning_rate": 7.980527761892255e-06,
      "loss": 0.7914,
      "step": 1024
    },
    {
      "epoch": 0.38778019483590276,
      "grad_norm": 1.0857069666875103,
      "learning_rate": 7.980353657322863e-06,
      "loss": 0.8622,
      "step": 1025
    },
    {
      "epoch": 0.3881585169772061,
      "grad_norm": 1.060211010001084,
      "learning_rate": 7.980178779780747e-06,
      "loss": 0.8381,
      "step": 1026
    },
    {
      "epoch": 0.3885368391185094,
      "grad_norm": 1.0543634996329088,
      "learning_rate": 7.980003129299865e-06,
      "loss": 0.8378,
      "step": 1027
    },
    {
      "epoch": 0.38891516125981274,
      "grad_norm": 1.1081388338013471,
      "learning_rate": 7.979826705914328e-06,
      "loss": 0.8338,
      "step": 1028
    },
    {
      "epoch": 0.389293483401116,
      "grad_norm": 1.104557100267363,
      "learning_rate": 7.9796495096584e-06,
      "loss": 0.795,
      "step": 1029
    },
    {
      "epoch": 0.38967180554241937,
      "grad_norm": 1.0655072241835162,
      "learning_rate": 7.979471540566489e-06,
      "loss": 0.8237,
      "step": 1030
    },
    {
      "epoch": 0.3900501276837227,
      "grad_norm": 1.0796326933387017,
      "learning_rate": 7.979292798673156e-06,
      "loss": 0.8556,
      "step": 1031
    },
    {
      "epoch": 0.390428449825026,
      "grad_norm": 1.0380712383913533,
      "learning_rate": 7.979113284013114e-06,
      "loss": 0.839,
      "step": 1032
    },
    {
      "epoch": 0.39080677196632935,
      "grad_norm": 1.085425876568373,
      "learning_rate": 7.97893299662122e-06,
      "loss": 0.8516,
      "step": 1033
    },
    {
      "epoch": 0.39118509410763264,
      "grad_norm": 1.2207322749435598,
      "learning_rate": 7.978751936532491e-06,
      "loss": 0.8549,
      "step": 1034
    },
    {
      "epoch": 0.391563416248936,
      "grad_norm": 1.088319428223248,
      "learning_rate": 7.978570103782086e-06,
      "loss": 0.8573,
      "step": 1035
    },
    {
      "epoch": 0.39194173839023927,
      "grad_norm": 1.0545678177926456,
      "learning_rate": 7.978387498405317e-06,
      "loss": 0.8325,
      "step": 1036
    },
    {
      "epoch": 0.3923200605315426,
      "grad_norm": 1.0921146086499482,
      "learning_rate": 7.978204120437641e-06,
      "loss": 0.7912,
      "step": 1037
    },
    {
      "epoch": 0.3926983826728459,
      "grad_norm": 1.1156394836322963,
      "learning_rate": 7.978019969914676e-06,
      "loss": 0.8344,
      "step": 1038
    },
    {
      "epoch": 0.39307670481414925,
      "grad_norm": 1.1163141481746923,
      "learning_rate": 7.97783504687218e-06,
      "loss": 0.8039,
      "step": 1039
    },
    {
      "epoch": 0.3934550269554526,
      "grad_norm": 1.1055832393565042,
      "learning_rate": 7.977649351346065e-06,
      "loss": 0.8098,
      "step": 1040
    },
    {
      "epoch": 0.3938333490967559,
      "grad_norm": 1.0475102246909884,
      "learning_rate": 7.97746288337239e-06,
      "loss": 0.7868,
      "step": 1041
    },
    {
      "epoch": 0.3942116712380592,
      "grad_norm": 1.0630199431469338,
      "learning_rate": 7.977275642987371e-06,
      "loss": 0.7965,
      "step": 1042
    },
    {
      "epoch": 0.3945899933793625,
      "grad_norm": 1.1096476912788604,
      "learning_rate": 7.977087630227368e-06,
      "loss": 0.8052,
      "step": 1043
    },
    {
      "epoch": 0.39496831552066586,
      "grad_norm": 1.0863091134871783,
      "learning_rate": 7.976898845128891e-06,
      "loss": 0.8435,
      "step": 1044
    },
    {
      "epoch": 0.39534663766196915,
      "grad_norm": 1.0492836175021802,
      "learning_rate": 7.976709287728602e-06,
      "loss": 0.8083,
      "step": 1045
    },
    {
      "epoch": 0.3957249598032725,
      "grad_norm": 1.0529300466346392,
      "learning_rate": 7.976518958063315e-06,
      "loss": 0.8274,
      "step": 1046
    },
    {
      "epoch": 0.3961032819445758,
      "grad_norm": 1.070473727548606,
      "learning_rate": 7.976327856169989e-06,
      "loss": 0.7971,
      "step": 1047
    },
    {
      "epoch": 0.3964816040858791,
      "grad_norm": 1.0617092300636013,
      "learning_rate": 7.976135982085734e-06,
      "loss": 0.8536,
      "step": 1048
    },
    {
      "epoch": 0.39685992622718247,
      "grad_norm": 1.0606504595804507,
      "learning_rate": 7.975943335847815e-06,
      "loss": 0.777,
      "step": 1049
    },
    {
      "epoch": 0.39723824836848576,
      "grad_norm": 1.1335961432026964,
      "learning_rate": 7.97574991749364e-06,
      "loss": 0.8707,
      "step": 1050
    },
    {
      "epoch": 0.3976165705097891,
      "grad_norm": 1.0932495202458485,
      "learning_rate": 7.975555727060773e-06,
      "loss": 0.8476,
      "step": 1051
    },
    {
      "epoch": 0.3979948926510924,
      "grad_norm": 1.0904729718461323,
      "learning_rate": 7.975360764586923e-06,
      "loss": 0.8325,
      "step": 1052
    },
    {
      "epoch": 0.39837321479239574,
      "grad_norm": 1.060481887356713,
      "learning_rate": 7.975165030109953e-06,
      "loss": 0.8293,
      "step": 1053
    },
    {
      "epoch": 0.398751536933699,
      "grad_norm": 1.0594136483291037,
      "learning_rate": 7.974968523667874e-06,
      "loss": 0.8333,
      "step": 1054
    },
    {
      "epoch": 0.39912985907500237,
      "grad_norm": 1.072066755016977,
      "learning_rate": 7.974771245298845e-06,
      "loss": 0.8588,
      "step": 1055
    },
    {
      "epoch": 0.39950818121630566,
      "grad_norm": 1.0407488984374065,
      "learning_rate": 7.974573195041179e-06,
      "loss": 0.8119,
      "step": 1056
    },
    {
      "epoch": 0.399886503357609,
      "grad_norm": 1.0897696384583164,
      "learning_rate": 7.974374372933333e-06,
      "loss": 0.8729,
      "step": 1057
    },
    {
      "epoch": 0.40026482549891235,
      "grad_norm": 1.0395716067441272,
      "learning_rate": 7.974174779013923e-06,
      "loss": 0.844,
      "step": 1058
    },
    {
      "epoch": 0.40064314764021564,
      "grad_norm": 1.0440432063315428,
      "learning_rate": 7.973974413321706e-06,
      "loss": 0.8311,
      "step": 1059
    },
    {
      "epoch": 0.401021469781519,
      "grad_norm": 1.085811930524537,
      "learning_rate": 7.973773275895593e-06,
      "loss": 0.8506,
      "step": 1060
    },
    {
      "epoch": 0.40139979192282227,
      "grad_norm": 1.017123583458792,
      "learning_rate": 7.973571366774646e-06,
      "loss": 0.7491,
      "step": 1061
    },
    {
      "epoch": 0.4017781140641256,
      "grad_norm": 1.041022717188848,
      "learning_rate": 7.973368685998074e-06,
      "loss": 0.8189,
      "step": 1062
    },
    {
      "epoch": 0.4021564362054289,
      "grad_norm": 1.0150607929017172,
      "learning_rate": 7.973165233605234e-06,
      "loss": 0.814,
      "step": 1063
    },
    {
      "epoch": 0.40253475834673225,
      "grad_norm": 1.0458554860554623,
      "learning_rate": 7.972961009635642e-06,
      "loss": 0.8123,
      "step": 1064
    },
    {
      "epoch": 0.40253475834673225,
      "eval_loss": 0.8304316997528076,
      "eval_runtime": 26.6669,
      "eval_samples_per_second": 33.187,
      "eval_steps_per_second": 1.05,
      "step": 1064
    },
    {
      "epoch": 0.40253475834673225,
      "eval_bench_accuracy_arc_challenge": 0.25,
      "eval_bench_accuracy_hellaswag": 0.285,
      "eval_bench_accuracy_mmlu": 0.2782608695652174,
      "eval_bench_average_accuracy": 0.2710869565217391,
      "eval_bench_loss": 4.517480147512336,
      "eval_bench_total_accuracy": 0.2725274725274725,
      "step": 1064
    },
    {
      "epoch": 0.40291308048803554,
      "grad_norm": 1.037409138160307,
      "learning_rate": 7.972756014128952e-06,
      "loss": 0.8159,
      "step": 1065
    },
    {
      "epoch": 0.4032914026293389,
      "grad_norm": 1.0836167448402902,
      "learning_rate": 7.972550247124976e-06,
      "loss": 0.8131,
      "step": 1066
    },
    {
      "epoch": 0.4036697247706422,
      "grad_norm": 1.0933137283571555,
      "learning_rate": 7.972343708663674e-06,
      "loss": 0.8183,
      "step": 1067
    },
    {
      "epoch": 0.4040480469119455,
      "grad_norm": 1.03216484709328,
      "learning_rate": 7.972136398785154e-06,
      "loss": 0.8569,
      "step": 1068
    },
    {
      "epoch": 0.40442636905324886,
      "grad_norm": 1.0656155608965763,
      "learning_rate": 7.971928317529676e-06,
      "loss": 0.8453,
      "step": 1069
    },
    {
      "epoch": 0.40480469119455215,
      "grad_norm": 1.0708238570639999,
      "learning_rate": 7.971719464937647e-06,
      "loss": 0.8367,
      "step": 1070
    },
    {
      "epoch": 0.4051830133358555,
      "grad_norm": 1.0621498480602682,
      "learning_rate": 7.971509841049628e-06,
      "loss": 0.8589,
      "step": 1071
    },
    {
      "epoch": 0.4055613354771588,
      "grad_norm": 1.0072315129856741,
      "learning_rate": 7.971299445906324e-06,
      "loss": 0.8379,
      "step": 1072
    },
    {
      "epoch": 0.4059396576184621,
      "grad_norm": 1.033456153626471,
      "learning_rate": 7.971088279548597e-06,
      "loss": 0.8079,
      "step": 1073
    },
    {
      "epoch": 0.4063179797597654,
      "grad_norm": 1.0079272901425842,
      "learning_rate": 7.970876342017452e-06,
      "loss": 0.7868,
      "step": 1074
    },
    {
      "epoch": 0.40669630190106876,
      "grad_norm": 1.0073805003714849,
      "learning_rate": 7.970663633354047e-06,
      "loss": 0.7988,
      "step": 1075
    },
    {
      "epoch": 0.4070746240423721,
      "grad_norm": 1.0708487426838318,
      "learning_rate": 7.97045015359969e-06,
      "loss": 0.8026,
      "step": 1076
    },
    {
      "epoch": 0.4074529461836754,
      "grad_norm": 1.069671541329999,
      "learning_rate": 7.970235902795838e-06,
      "loss": 0.8462,
      "step": 1077
    },
    {
      "epoch": 0.40783126832497874,
      "grad_norm": 1.0250427566221285,
      "learning_rate": 7.9700208809841e-06,
      "loss": 0.819,
      "step": 1078
    },
    {
      "epoch": 0.408209590466282,
      "grad_norm": 1.035811754086645,
      "learning_rate": 7.969805088206226e-06,
      "loss": 0.8192,
      "step": 1079
    },
    {
      "epoch": 0.40858791260758537,
      "grad_norm": 1.0919846226041652,
      "learning_rate": 7.96958852450413e-06,
      "loss": 0.8463,
      "step": 1080
    },
    {
      "epoch": 0.40896623474888866,
      "grad_norm": 1.0922304905923719,
      "learning_rate": 7.969371189919865e-06,
      "loss": 0.8505,
      "step": 1081
    },
    {
      "epoch": 0.409344556890192,
      "grad_norm": 1.0327335666733615,
      "learning_rate": 7.969153084495636e-06,
      "loss": 0.8054,
      "step": 1082
    },
    {
      "epoch": 0.4097228790314953,
      "grad_norm": 1.069756821894608,
      "learning_rate": 7.968934208273798e-06,
      "loss": 0.8348,
      "step": 1083
    },
    {
      "epoch": 0.41010120117279864,
      "grad_norm": 1.0472686446394408,
      "learning_rate": 7.968714561296859e-06,
      "loss": 0.8302,
      "step": 1084
    },
    {
      "epoch": 0.410479523314102,
      "grad_norm": 1.0462638623089058,
      "learning_rate": 7.96849414360747e-06,
      "loss": 0.8249,
      "step": 1085
    },
    {
      "epoch": 0.41085784545540527,
      "grad_norm": 1.0056327093077677,
      "learning_rate": 7.96827295524844e-06,
      "loss": 0.7795,
      "step": 1086
    },
    {
      "epoch": 0.4112361675967086,
      "grad_norm": 1.0244037556207601,
      "learning_rate": 7.968050996262716e-06,
      "loss": 0.7905,
      "step": 1087
    },
    {
      "epoch": 0.4116144897380119,
      "grad_norm": 1.0346973741005767,
      "learning_rate": 7.967828266693409e-06,
      "loss": 0.8371,
      "step": 1088
    },
    {
      "epoch": 0.41199281187931525,
      "grad_norm": 1.0958021967982934,
      "learning_rate": 7.96760476658377e-06,
      "loss": 0.8479,
      "step": 1089
    },
    {
      "epoch": 0.41237113402061853,
      "grad_norm": 1.0136255102022522,
      "learning_rate": 7.967380495977201e-06,
      "loss": 0.8055,
      "step": 1090
    },
    {
      "epoch": 0.4127494561619219,
      "grad_norm": 1.0687414316917077,
      "learning_rate": 7.967155454917255e-06,
      "loss": 0.8481,
      "step": 1091
    },
    {
      "epoch": 0.4131277783032252,
      "grad_norm": 1.0765456661292323,
      "learning_rate": 7.966929643447634e-06,
      "loss": 0.8115,
      "step": 1092
    },
    {
      "epoch": 0.4135061004445285,
      "grad_norm": 1.078258124622418,
      "learning_rate": 7.966703061612192e-06,
      "loss": 0.8319,
      "step": 1093
    },
    {
      "epoch": 0.41388442258583186,
      "grad_norm": 1.0491237525414794,
      "learning_rate": 7.966475709454928e-06,
      "loss": 0.8592,
      "step": 1094
    },
    {
      "epoch": 0.41426274472713515,
      "grad_norm": 1.0719668981104609,
      "learning_rate": 7.966247587019994e-06,
      "loss": 0.821,
      "step": 1095
    },
    {
      "epoch": 0.4146410668684385,
      "grad_norm": 1.026254989024167,
      "learning_rate": 7.966018694351691e-06,
      "loss": 0.8168,
      "step": 1096
    },
    {
      "epoch": 0.4150193890097418,
      "grad_norm": 1.0321711854785867,
      "learning_rate": 7.96578903149447e-06,
      "loss": 0.8255,
      "step": 1097
    },
    {
      "epoch": 0.4153977111510451,
      "grad_norm": 1.0513898483857722,
      "learning_rate": 7.965558598492929e-06,
      "loss": 0.7748,
      "step": 1098
    },
    {
      "epoch": 0.4157760332923484,
      "grad_norm": 1.0364175851458883,
      "learning_rate": 7.965327395391819e-06,
      "loss": 0.7978,
      "step": 1099
    },
    {
      "epoch": 0.41615435543365176,
      "grad_norm": 0.985307760157813,
      "learning_rate": 7.965095422236038e-06,
      "loss": 0.801,
      "step": 1100
    },
    {
      "epoch": 0.4165326775749551,
      "grad_norm": 1.0813628193591218,
      "learning_rate": 7.964862679070634e-06,
      "loss": 0.845,
      "step": 1101
    },
    {
      "epoch": 0.4169109997162584,
      "grad_norm": 1.0734207809402587,
      "learning_rate": 7.964629165940808e-06,
      "loss": 0.8817,
      "step": 1102
    },
    {
      "epoch": 0.41728932185756173,
      "grad_norm": 1.0599230797124688,
      "learning_rate": 7.964394882891904e-06,
      "loss": 0.8085,
      "step": 1103
    },
    {
      "epoch": 0.417667643998865,
      "grad_norm": 1.078793670107089,
      "learning_rate": 7.96415982996942e-06,
      "loss": 0.7938,
      "step": 1104
    },
    {
      "epoch": 0.41804596614016837,
      "grad_norm": 1.0350357122236093,
      "learning_rate": 7.963924007219002e-06,
      "loss": 0.8207,
      "step": 1105
    },
    {
      "epoch": 0.41842428828147166,
      "grad_norm": 1.041240999715739,
      "learning_rate": 7.963687414686449e-06,
      "loss": 0.7737,
      "step": 1106
    },
    {
      "epoch": 0.418802610422775,
      "grad_norm": 1.1066667842190356,
      "learning_rate": 7.963450052417703e-06,
      "loss": 0.8191,
      "step": 1107
    },
    {
      "epoch": 0.4191809325640783,
      "grad_norm": 1.0866062695241046,
      "learning_rate": 7.963211920458863e-06,
      "loss": 0.8098,
      "step": 1108
    },
    {
      "epoch": 0.41955925470538163,
      "grad_norm": 1.0628974307927237,
      "learning_rate": 7.962973018856169e-06,
      "loss": 0.836,
      "step": 1109
    },
    {
      "epoch": 0.419937576846685,
      "grad_norm": 1.0490148472801595,
      "learning_rate": 7.962733347656018e-06,
      "loss": 0.8074,
      "step": 1110
    },
    {
      "epoch": 0.42031589898798827,
      "grad_norm": 1.056521276681419,
      "learning_rate": 7.962492906904953e-06,
      "loss": 0.7798,
      "step": 1111
    },
    {
      "epoch": 0.4206942211292916,
      "grad_norm": 1.0568484786859005,
      "learning_rate": 7.962251696649665e-06,
      "loss": 0.832,
      "step": 1112
    },
    {
      "epoch": 0.4210725432705949,
      "grad_norm": 1.022548771593414,
      "learning_rate": 7.962009716937e-06,
      "loss": 0.8576,
      "step": 1113
    },
    {
      "epoch": 0.42145086541189825,
      "grad_norm": 1.0376517279626776,
      "learning_rate": 7.961766967813946e-06,
      "loss": 0.7709,
      "step": 1114
    },
    {
      "epoch": 0.42182918755320153,
      "grad_norm": 1.057176802372392,
      "learning_rate": 7.961523449327646e-06,
      "loss": 0.8684,
      "step": 1115
    },
    {
      "epoch": 0.4222075096945049,
      "grad_norm": 1.0278310719203412,
      "learning_rate": 7.961279161525389e-06,
      "loss": 0.7934,
      "step": 1116
    },
    {
      "epoch": 0.42258583183580817,
      "grad_norm": 1.0116937469277474,
      "learning_rate": 7.961034104454618e-06,
      "loss": 0.8288,
      "step": 1117
    },
    {
      "epoch": 0.4229641539771115,
      "grad_norm": 1.0791508367529585,
      "learning_rate": 7.960788278162918e-06,
      "loss": 0.8295,
      "step": 1118
    },
    {
      "epoch": 0.42334247611841486,
      "grad_norm": 1.0482664569638203,
      "learning_rate": 7.960541682698034e-06,
      "loss": 0.8044,
      "step": 1119
    },
    {
      "epoch": 0.42372079825971815,
      "grad_norm": 1.026033507367731,
      "learning_rate": 7.960294318107847e-06,
      "loss": 0.8086,
      "step": 1120
    },
    {
      "epoch": 0.4240991204010215,
      "grad_norm": 1.0713832704640005,
      "learning_rate": 7.960046184440399e-06,
      "loss": 0.8421,
      "step": 1121
    },
    {
      "epoch": 0.4244774425423248,
      "grad_norm": 1.0635267452769637,
      "learning_rate": 7.959797281743876e-06,
      "loss": 0.8452,
      "step": 1122
    },
    {
      "epoch": 0.4248557646836281,
      "grad_norm": 1.046318335512741,
      "learning_rate": 7.959547610066613e-06,
      "loss": 0.7944,
      "step": 1123
    },
    {
      "epoch": 0.4252340868249314,
      "grad_norm": 1.0788089412291229,
      "learning_rate": 7.959297169457097e-06,
      "loss": 0.8338,
      "step": 1124
    },
    {
      "epoch": 0.42561240896623476,
      "grad_norm": 1.0582140885008549,
      "learning_rate": 7.959045959963962e-06,
      "loss": 0.7914,
      "step": 1125
    },
    {
      "epoch": 0.42599073110753805,
      "grad_norm": 1.0773203264262958,
      "learning_rate": 7.958793981635991e-06,
      "loss": 0.8549,
      "step": 1126
    },
    {
      "epoch": 0.4263690532488414,
      "grad_norm": 1.0738918058139102,
      "learning_rate": 7.958541234522119e-06,
      "loss": 0.7836,
      "step": 1127
    },
    {
      "epoch": 0.42674737539014473,
      "grad_norm": 1.0307363548970123,
      "learning_rate": 7.958287718671429e-06,
      "loss": 0.829,
      "step": 1128
    },
    {
      "epoch": 0.427125697531448,
      "grad_norm": 1.0223432647328048,
      "learning_rate": 7.958033434133152e-06,
      "loss": 0.8421,
      "step": 1129
    },
    {
      "epoch": 0.42750401967275137,
      "grad_norm": 1.0402584891579054,
      "learning_rate": 7.95777838095667e-06,
      "loss": 0.7836,
      "step": 1130
    },
    {
      "epoch": 0.42788234181405466,
      "grad_norm": 1.0761841482760737,
      "learning_rate": 7.957522559191514e-06,
      "loss": 0.7933,
      "step": 1131
    },
    {
      "epoch": 0.428260663955358,
      "grad_norm": 1.0391476619745978,
      "learning_rate": 7.957265968887361e-06,
      "loss": 0.811,
      "step": 1132
    },
    {
      "epoch": 0.4286389860966613,
      "grad_norm": 1.026814188051067,
      "learning_rate": 7.957008610094043e-06,
      "loss": 0.8078,
      "step": 1133
    },
    {
      "epoch": 0.42901730823796463,
      "grad_norm": 1.0406330571564124,
      "learning_rate": 7.956750482861538e-06,
      "loss": 0.8359,
      "step": 1134
    },
    {
      "epoch": 0.4293956303792679,
      "grad_norm": 1.0642979501183267,
      "learning_rate": 7.956491587239971e-06,
      "loss": 0.8045,
      "step": 1135
    },
    {
      "epoch": 0.42977395252057127,
      "grad_norm": 1.0393212545559525,
      "learning_rate": 7.956231923279624e-06,
      "loss": 0.8348,
      "step": 1136
    },
    {
      "epoch": 0.4301522746618746,
      "grad_norm": 1.0470124602821342,
      "learning_rate": 7.955971491030917e-06,
      "loss": 0.8148,
      "step": 1137
    },
    {
      "epoch": 0.4305305968031779,
      "grad_norm": 1.0676455383028118,
      "learning_rate": 7.955710290544428e-06,
      "loss": 0.8336,
      "step": 1138
    },
    {
      "epoch": 0.43090891894448125,
      "grad_norm": 1.0721667527067038,
      "learning_rate": 7.955448321870882e-06,
      "loss": 0.831,
      "step": 1139
    },
    {
      "epoch": 0.43128724108578453,
      "grad_norm": 1.064318000094558,
      "learning_rate": 7.955185585061151e-06,
      "loss": 0.8335,
      "step": 1140
    },
    {
      "epoch": 0.4316655632270879,
      "grad_norm": 1.0302584817777816,
      "learning_rate": 7.95492208016626e-06,
      "loss": 0.791,
      "step": 1141
    },
    {
      "epoch": 0.43204388536839117,
      "grad_norm": 1.0256366632375336,
      "learning_rate": 7.954657807237379e-06,
      "loss": 0.8253,
      "step": 1142
    },
    {
      "epoch": 0.4324222075096945,
      "grad_norm": 1.0251051777197329,
      "learning_rate": 7.954392766325828e-06,
      "loss": 0.8223,
      "step": 1143
    },
    {
      "epoch": 0.4328005296509978,
      "grad_norm": 1.045445405795435,
      "learning_rate": 7.954126957483077e-06,
      "loss": 0.7606,
      "step": 1144
    },
    {
      "epoch": 0.43317885179230115,
      "grad_norm": 1.0425200750958303,
      "learning_rate": 7.95386038076075e-06,
      "loss": 0.8537,
      "step": 1145
    },
    {
      "epoch": 0.4335571739336045,
      "grad_norm": 1.0419269404142824,
      "learning_rate": 7.953593036210611e-06,
      "loss": 0.8277,
      "step": 1146
    },
    {
      "epoch": 0.4339354960749078,
      "grad_norm": 1.084574429840746,
      "learning_rate": 7.953324923884578e-06,
      "loss": 0.803,
      "step": 1147
    },
    {
      "epoch": 0.4343138182162111,
      "grad_norm": 1.0419638253671073,
      "learning_rate": 7.953056043834717e-06,
      "loss": 0.8334,
      "step": 1148
    },
    {
      "epoch": 0.4346921403575144,
      "grad_norm": 1.0168098031537844,
      "learning_rate": 7.952786396113248e-06,
      "loss": 0.7849,
      "step": 1149
    },
    {
      "epoch": 0.43507046249881776,
      "grad_norm": 1.0391261866313206,
      "learning_rate": 7.95251598077253e-06,
      "loss": 0.792,
      "step": 1150
    },
    {
      "epoch": 0.43544878464012104,
      "grad_norm": 1.0145928185391837,
      "learning_rate": 7.95224479786508e-06,
      "loss": 0.8069,
      "step": 1151
    },
    {
      "epoch": 0.4358271067814244,
      "grad_norm": 1.0145834983924735,
      "learning_rate": 7.951972847443561e-06,
      "loss": 0.8045,
      "step": 1152
    },
    {
      "epoch": 0.4362054289227277,
      "grad_norm": 1.0385429868897398,
      "learning_rate": 7.951700129560786e-06,
      "loss": 0.8091,
      "step": 1153
    },
    {
      "epoch": 0.436583751064031,
      "grad_norm": 1.0484204110539974,
      "learning_rate": 7.951426644269712e-06,
      "loss": 0.8118,
      "step": 1154
    },
    {
      "epoch": 0.43696207320533437,
      "grad_norm": 1.059201104727976,
      "learning_rate": 7.951152391623452e-06,
      "loss": 0.8335,
      "step": 1155
    },
    {
      "epoch": 0.43734039534663766,
      "grad_norm": 1.0061721443896443,
      "learning_rate": 7.950877371675265e-06,
      "loss": 0.7489,
      "step": 1156
    },
    {
      "epoch": 0.437718717487941,
      "grad_norm": 1.0920232553881484,
      "learning_rate": 7.950601584478557e-06,
      "loss": 0.8012,
      "step": 1157
    },
    {
      "epoch": 0.4380970396292443,
      "grad_norm": 1.0519115174631195,
      "learning_rate": 7.950325030086889e-06,
      "loss": 0.7923,
      "step": 1158
    },
    {
      "epoch": 0.43847536177054763,
      "grad_norm": 1.0813679052789027,
      "learning_rate": 7.950047708553962e-06,
      "loss": 0.8313,
      "step": 1159
    },
    {
      "epoch": 0.4388536839118509,
      "grad_norm": 1.0854599046397435,
      "learning_rate": 7.949769619933634e-06,
      "loss": 0.8616,
      "step": 1160
    },
    {
      "epoch": 0.43923200605315427,
      "grad_norm": 1.1104488658598137,
      "learning_rate": 7.94949076427991e-06,
      "loss": 0.7878,
      "step": 1161
    },
    {
      "epoch": 0.43961032819445756,
      "grad_norm": 1.1346641422155257,
      "learning_rate": 7.949211141646941e-06,
      "loss": 0.8287,
      "step": 1162
    },
    {
      "epoch": 0.4399886503357609,
      "grad_norm": 1.0632008460543734,
      "learning_rate": 7.948930752089029e-06,
      "loss": 0.8278,
      "step": 1163
    },
    {
      "epoch": 0.44036697247706424,
      "grad_norm": 1.0770714736885665,
      "learning_rate": 7.948649595660626e-06,
      "loss": 0.794,
      "step": 1164
    },
    {
      "epoch": 0.44074529461836753,
      "grad_norm": 1.0320296674718166,
      "learning_rate": 7.948367672416329e-06,
      "loss": 0.7973,
      "step": 1165
    },
    {
      "epoch": 0.4411236167596709,
      "grad_norm": 1.037195297637391,
      "learning_rate": 7.94808498241089e-06,
      "loss": 0.8124,
      "step": 1166
    },
    {
      "epoch": 0.44150193890097417,
      "grad_norm": 1.07174382564237,
      "learning_rate": 7.947801525699204e-06,
      "loss": 0.8501,
      "step": 1167
    },
    {
      "epoch": 0.4418802610422775,
      "grad_norm": 1.0423383360705205,
      "learning_rate": 7.947517302336321e-06,
      "loss": 0.8023,
      "step": 1168
    },
    {
      "epoch": 0.4422585831835808,
      "grad_norm": 1.0225149206809994,
      "learning_rate": 7.947232312377431e-06,
      "loss": 0.8082,
      "step": 1169
    },
    {
      "epoch": 0.44263690532488414,
      "grad_norm": 1.0490213514112987,
      "learning_rate": 7.946946555877883e-06,
      "loss": 0.8553,
      "step": 1170
    },
    {
      "epoch": 0.44301522746618743,
      "grad_norm": 1.0565295484573578,
      "learning_rate": 7.946660032893168e-06,
      "loss": 0.8334,
      "step": 1171
    },
    {
      "epoch": 0.4433935496074908,
      "grad_norm": 1.096379949923879,
      "learning_rate": 7.946372743478928e-06,
      "loss": 0.7885,
      "step": 1172
    },
    {
      "epoch": 0.4437718717487941,
      "grad_norm": 1.0635010257740696,
      "learning_rate": 7.946084687690952e-06,
      "loss": 0.867,
      "step": 1173
    },
    {
      "epoch": 0.4441501938900974,
      "grad_norm": 1.046045957242929,
      "learning_rate": 7.945795865585184e-06,
      "loss": 0.7794,
      "step": 1174
    },
    {
      "epoch": 0.44452851603140076,
      "grad_norm": 1.1358219370976814,
      "learning_rate": 7.945506277217707e-06,
      "loss": 0.8048,
      "step": 1175
    },
    {
      "epoch": 0.44490683817270404,
      "grad_norm": 1.0850391747638126,
      "learning_rate": 7.945215922644764e-06,
      "loss": 0.8056,
      "step": 1176
    },
    {
      "epoch": 0.4452851603140074,
      "grad_norm": 1.1532691295951847,
      "learning_rate": 7.944924801922734e-06,
      "loss": 0.8176,
      "step": 1177
    },
    {
      "epoch": 0.4456634824553107,
      "grad_norm": 1.0915907522482993,
      "learning_rate": 7.944632915108158e-06,
      "loss": 0.7994,
      "step": 1178
    },
    {
      "epoch": 0.446041804596614,
      "grad_norm": 1.0282978902411528,
      "learning_rate": 7.944340262257718e-06,
      "loss": 0.8263,
      "step": 1179
    },
    {
      "epoch": 0.4464201267379173,
      "grad_norm": 1.1021567277496518,
      "learning_rate": 7.944046843428244e-06,
      "loss": 0.829,
      "step": 1180
    },
    {
      "epoch": 0.44679844887922066,
      "grad_norm": 1.0694612963890957,
      "learning_rate": 7.94375265867672e-06,
      "loss": 0.8565,
      "step": 1181
    },
    {
      "epoch": 0.447176771020524,
      "grad_norm": 1.0750903881599976,
      "learning_rate": 7.943457708060272e-06,
      "loss": 0.8396,
      "step": 1182
    },
    {
      "epoch": 0.4475550931618273,
      "grad_norm": 1.0453024844416716,
      "learning_rate": 7.943161991636183e-06,
      "loss": 0.8096,
      "step": 1183
    },
    {
      "epoch": 0.44793341530313063,
      "grad_norm": 1.0657511458371332,
      "learning_rate": 7.942865509461879e-06,
      "loss": 0.7964,
      "step": 1184
    },
    {
      "epoch": 0.4483117374444339,
      "grad_norm": 1.0565556737130861,
      "learning_rate": 7.942568261594931e-06,
      "loss": 0.8254,
      "step": 1185
    },
    {
      "epoch": 0.44869005958573727,
      "grad_norm": 1.0811193147116154,
      "learning_rate": 7.942270248093072e-06,
      "loss": 0.8741,
      "step": 1186
    },
    {
      "epoch": 0.44906838172704056,
      "grad_norm": 1.0468093016525521,
      "learning_rate": 7.941971469014168e-06,
      "loss": 0.8379,
      "step": 1187
    },
    {
      "epoch": 0.4494467038683439,
      "grad_norm": 1.06315933336805,
      "learning_rate": 7.941671924416245e-06,
      "loss": 0.8294,
      "step": 1188
    },
    {
      "epoch": 0.4498250260096472,
      "grad_norm": 1.044215685157516,
      "learning_rate": 7.941371614357473e-06,
      "loss": 0.8093,
      "step": 1189
    },
    {
      "epoch": 0.45020334815095053,
      "grad_norm": 1.0172723595558777,
      "learning_rate": 7.941070538896172e-06,
      "loss": 0.777,
      "step": 1190
    },
    {
      "epoch": 0.4505816702922539,
      "grad_norm": 1.0750120304696666,
      "learning_rate": 7.940768698090809e-06,
      "loss": 0.8105,
      "step": 1191
    },
    {
      "epoch": 0.45095999243355717,
      "grad_norm": 1.0440692979176232,
      "learning_rate": 7.940466091999999e-06,
      "loss": 0.8537,
      "step": 1192
    },
    {
      "epoch": 0.4513383145748605,
      "grad_norm": 1.031643540251273,
      "learning_rate": 7.940162720682508e-06,
      "loss": 0.8362,
      "step": 1193
    },
    {
      "epoch": 0.4517166367161638,
      "grad_norm": 1.0019678147671374,
      "learning_rate": 7.939858584197252e-06,
      "loss": 0.8142,
      "step": 1194
    },
    {
      "epoch": 0.45209495885746714,
      "grad_norm": 1.060840824446392,
      "learning_rate": 7.939553682603292e-06,
      "loss": 0.7826,
      "step": 1195
    },
    {
      "epoch": 0.45247328099877043,
      "grad_norm": 1.0604407355830034,
      "learning_rate": 7.939248015959839e-06,
      "loss": 0.8276,
      "step": 1196
    },
    {
      "epoch": 0.4528516031400738,
      "grad_norm": 1.0445689437408072,
      "learning_rate": 7.938941584326251e-06,
      "loss": 0.7994,
      "step": 1197
    },
    {
      "epoch": 0.4528516031400738,
      "eval_loss": 0.8220446705818176,
      "eval_runtime": 26.7666,
      "eval_samples_per_second": 33.064,
      "eval_steps_per_second": 1.046,
      "step": 1197
    },
    {
      "epoch": 0.4528516031400738,
      "eval_bench_accuracy_arc_challenge": 0.2571428571428571,
      "eval_bench_accuracy_hellaswag": 0.225,
      "eval_bench_accuracy_mmlu": 0.23478260869565218,
      "eval_bench_average_accuracy": 0.23897515527950308,
      "eval_bench_loss": 5.286834716796875,
      "eval_bench_total_accuracy": 0.23736263736263735,
      "step": 1197
    },
    {
      "epoch": 0.45322992528137707,
      "grad_norm": 1.0158388274699295,
      "learning_rate": 7.938634387762039e-06,
      "loss": 0.8241,
      "step": 1198
    },
    {
      "epoch": 0.4536082474226804,
      "grad_norm": 1.165515743538843,
      "learning_rate": 7.938326426326857e-06,
      "loss": 0.8526,
      "step": 1199
    },
    {
      "epoch": 0.45398656956398376,
      "grad_norm": 1.0460295029244764,
      "learning_rate": 7.938017700080514e-06,
      "loss": 0.7998,
      "step": 1200
    },
    {
      "epoch": 0.45436489170528704,
      "grad_norm": 1.0837173342344641,
      "learning_rate": 7.93770820908296e-06,
      "loss": 0.7997,
      "step": 1201
    },
    {
      "epoch": 0.4547432138465904,
      "grad_norm": 1.0243169477083875,
      "learning_rate": 7.937397953394296e-06,
      "loss": 0.7991,
      "step": 1202
    },
    {
      "epoch": 0.4551215359878937,
      "grad_norm": 1.0695328376321132,
      "learning_rate": 7.937086933074777e-06,
      "loss": 0.7884,
      "step": 1203
    },
    {
      "epoch": 0.455499858129197,
      "grad_norm": 1.0594971537497897,
      "learning_rate": 7.9367751481848e-06,
      "loss": 0.793,
      "step": 1204
    },
    {
      "epoch": 0.4558781802705003,
      "grad_norm": 1.0554812656920887,
      "learning_rate": 7.936462598784913e-06,
      "loss": 0.8283,
      "step": 1205
    },
    {
      "epoch": 0.45625650241180365,
      "grad_norm": 1.0592140535117982,
      "learning_rate": 7.936149284935811e-06,
      "loss": 0.8323,
      "step": 1206
    },
    {
      "epoch": 0.45663482455310694,
      "grad_norm": 1.026196033728254,
      "learning_rate": 7.935835206698342e-06,
      "loss": 0.8024,
      "step": 1207
    },
    {
      "epoch": 0.4570131466944103,
      "grad_norm": 1.0292414805578125,
      "learning_rate": 7.935520364133494e-06,
      "loss": 0.7895,
      "step": 1208
    },
    {
      "epoch": 0.45739146883571363,
      "grad_norm": 1.0251629830106175,
      "learning_rate": 7.935204757302413e-06,
      "loss": 0.8086,
      "step": 1209
    },
    {
      "epoch": 0.4577697909770169,
      "grad_norm": 1.0757191280770386,
      "learning_rate": 7.934888386266387e-06,
      "loss": 0.8562,
      "step": 1210
    },
    {
      "epoch": 0.45814811311832027,
      "grad_norm": 1.0698429731328996,
      "learning_rate": 7.934571251086853e-06,
      "loss": 0.8518,
      "step": 1211
    },
    {
      "epoch": 0.45852643525962355,
      "grad_norm": 1.074189860162607,
      "learning_rate": 7.934253351825402e-06,
      "loss": 0.7941,
      "step": 1212
    },
    {
      "epoch": 0.4589047574009269,
      "grad_norm": 1.0538357299975836,
      "learning_rate": 7.933934688543764e-06,
      "loss": 0.8394,
      "step": 1213
    },
    {
      "epoch": 0.4592830795422302,
      "grad_norm": 1.0421117329655678,
      "learning_rate": 7.933615261303826e-06,
      "loss": 0.7609,
      "step": 1214
    },
    {
      "epoch": 0.45966140168353353,
      "grad_norm": 1.0391554404129049,
      "learning_rate": 7.933295070167617e-06,
      "loss": 0.8257,
      "step": 1215
    },
    {
      "epoch": 0.4600397238248368,
      "grad_norm": 1.0446148939643307,
      "learning_rate": 7.93297411519732e-06,
      "loss": 0.8104,
      "step": 1216
    },
    {
      "epoch": 0.46041804596614017,
      "grad_norm": 1.0344384305012022,
      "learning_rate": 7.932652396455262e-06,
      "loss": 0.8044,
      "step": 1217
    },
    {
      "epoch": 0.4607963681074435,
      "grad_norm": 1.0733053009164926,
      "learning_rate": 7.932329914003919e-06,
      "loss": 0.8174,
      "step": 1218
    },
    {
      "epoch": 0.4611746902487468,
      "grad_norm": 1.0714389655461505,
      "learning_rate": 7.932006667905917e-06,
      "loss": 0.8255,
      "step": 1219
    },
    {
      "epoch": 0.46155301239005014,
      "grad_norm": 1.028255926596019,
      "learning_rate": 7.93168265822403e-06,
      "loss": 0.8132,
      "step": 1220
    },
    {
      "epoch": 0.46193133453135343,
      "grad_norm": 1.0523184669233379,
      "learning_rate": 7.93135788502118e-06,
      "loss": 0.8428,
      "step": 1221
    },
    {
      "epoch": 0.4623096566726568,
      "grad_norm": 1.0557227987751663,
      "learning_rate": 7.931032348360435e-06,
      "loss": 0.8332,
      "step": 1222
    },
    {
      "epoch": 0.46268797881396007,
      "grad_norm": 1.0609398608821474,
      "learning_rate": 7.930706048305015e-06,
      "loss": 0.8254,
      "step": 1223
    },
    {
      "epoch": 0.4630663009552634,
      "grad_norm": 1.0113270947271225,
      "learning_rate": 7.930378984918286e-06,
      "loss": 0.8335,
      "step": 1224
    },
    {
      "epoch": 0.4634446230965667,
      "grad_norm": 1.0131305243085915,
      "learning_rate": 7.93005115826376e-06,
      "loss": 0.7971,
      "step": 1225
    },
    {
      "epoch": 0.46382294523787004,
      "grad_norm": 1.0569179946125011,
      "learning_rate": 7.929722568405108e-06,
      "loss": 0.8166,
      "step": 1226
    },
    {
      "epoch": 0.4642012673791734,
      "grad_norm": 1.042578338856108,
      "learning_rate": 7.929393215406131e-06,
      "loss": 0.8204,
      "step": 1227
    },
    {
      "epoch": 0.4645795895204767,
      "grad_norm": 1.0748606201799873,
      "learning_rate": 7.929063099330795e-06,
      "loss": 0.8152,
      "step": 1228
    },
    {
      "epoch": 0.46495791166178,
      "grad_norm": 1.0587959397105573,
      "learning_rate": 7.928732220243206e-06,
      "loss": 0.8452,
      "step": 1229
    },
    {
      "epoch": 0.4653362338030833,
      "grad_norm": 1.0914151462165957,
      "learning_rate": 7.928400578207617e-06,
      "loss": 0.8131,
      "step": 1230
    },
    {
      "epoch": 0.46571455594438665,
      "grad_norm": 1.0396349529813116,
      "learning_rate": 7.928068173288438e-06,
      "loss": 0.8113,
      "step": 1231
    },
    {
      "epoch": 0.46609287808568994,
      "grad_norm": 1.0607390438435043,
      "learning_rate": 7.927735005550215e-06,
      "loss": 0.8368,
      "step": 1232
    },
    {
      "epoch": 0.4664712002269933,
      "grad_norm": 1.0290648955783543,
      "learning_rate": 7.927401075057652e-06,
      "loss": 0.808,
      "step": 1233
    },
    {
      "epoch": 0.46684952236829663,
      "grad_norm": 1.0438273949617254,
      "learning_rate": 7.927066381875595e-06,
      "loss": 0.8109,
      "step": 1234
    },
    {
      "epoch": 0.4672278445095999,
      "grad_norm": 1.0492773898494756,
      "learning_rate": 7.926730926069041e-06,
      "loss": 0.8263,
      "step": 1235
    },
    {
      "epoch": 0.46760616665090327,
      "grad_norm": 1.0898615275461312,
      "learning_rate": 7.926394707703133e-06,
      "loss": 0.8417,
      "step": 1236
    },
    {
      "epoch": 0.46798448879220655,
      "grad_norm": 1.0371312864392424,
      "learning_rate": 7.926057726843167e-06,
      "loss": 0.7853,
      "step": 1237
    },
    {
      "epoch": 0.4683628109335099,
      "grad_norm": 1.0311331135840094,
      "learning_rate": 7.925719983554582e-06,
      "loss": 0.8433,
      "step": 1238
    },
    {
      "epoch": 0.4687411330748132,
      "grad_norm": 1.0104501833340858,
      "learning_rate": 7.925381477902967e-06,
      "loss": 0.8246,
      "step": 1239
    },
    {
      "epoch": 0.46911945521611653,
      "grad_norm": 1.033351900846643,
      "learning_rate": 7.92504220995406e-06,
      "loss": 0.801,
      "step": 1240
    },
    {
      "epoch": 0.4694977773574198,
      "grad_norm": 1.0678576004897766,
      "learning_rate": 7.92470217977374e-06,
      "loss": 0.7953,
      "step": 1241
    },
    {
      "epoch": 0.46987609949872317,
      "grad_norm": 1.049154054889686,
      "learning_rate": 7.924361387428047e-06,
      "loss": 0.8034,
      "step": 1242
    },
    {
      "epoch": 0.4702544216400265,
      "grad_norm": 1.0501910151623293,
      "learning_rate": 7.924019832983159e-06,
      "loss": 0.8421,
      "step": 1243
    },
    {
      "epoch": 0.4706327437813298,
      "grad_norm": 1.0265699705882914,
      "learning_rate": 7.923677516505404e-06,
      "loss": 0.7909,
      "step": 1244
    },
    {
      "epoch": 0.47101106592263314,
      "grad_norm": 1.0395280931797561,
      "learning_rate": 7.92333443806126e-06,
      "loss": 0.8283,
      "step": 1245
    },
    {
      "epoch": 0.47138938806393643,
      "grad_norm": 1.006365421675378,
      "learning_rate": 7.922990597717352e-06,
      "loss": 0.8065,
      "step": 1246
    },
    {
      "epoch": 0.4717677102052398,
      "grad_norm": 1.0276097967827926,
      "learning_rate": 7.922645995540453e-06,
      "loss": 0.808,
      "step": 1247
    },
    {
      "epoch": 0.47214603234654307,
      "grad_norm": 0.990132630477362,
      "learning_rate": 7.922300631597482e-06,
      "loss": 0.8006,
      "step": 1248
    },
    {
      "epoch": 0.4725243544878464,
      "grad_norm": 1.047163368722463,
      "learning_rate": 7.921954505955508e-06,
      "loss": 0.7698,
      "step": 1249
    },
    {
      "epoch": 0.4729026766291497,
      "grad_norm": 1.0735335320173403,
      "learning_rate": 7.921607618681748e-06,
      "loss": 0.807,
      "step": 1250
    },
    {
      "epoch": 0.47328099877045304,
      "grad_norm": 1.0461927309518722,
      "learning_rate": 7.921259969843568e-06,
      "loss": 0.8158,
      "step": 1251
    },
    {
      "epoch": 0.4736593209117564,
      "grad_norm": 1.0478396570827158,
      "learning_rate": 7.920911559508476e-06,
      "loss": 0.8386,
      "step": 1252
    },
    {
      "epoch": 0.4740376430530597,
      "grad_norm": 1.0449949458790635,
      "learning_rate": 7.920562387744139e-06,
      "loss": 0.769,
      "step": 1253
    },
    {
      "epoch": 0.474415965194363,
      "grad_norm": 1.0333564168358704,
      "learning_rate": 7.92021245461836e-06,
      "loss": 0.7821,
      "step": 1254
    },
    {
      "epoch": 0.4747942873356663,
      "grad_norm": 1.0160573616445434,
      "learning_rate": 7.919861760199095e-06,
      "loss": 0.8134,
      "step": 1255
    },
    {
      "epoch": 0.47517260947696965,
      "grad_norm": 1.113593494987971,
      "learning_rate": 7.91951030455445e-06,
      "loss": 0.8009,
      "step": 1256
    },
    {
      "epoch": 0.47555093161827294,
      "grad_norm": 1.0583016464392816,
      "learning_rate": 7.919158087752675e-06,
      "loss": 0.8338,
      "step": 1257
    },
    {
      "epoch": 0.4759292537595763,
      "grad_norm": 1.0274177510689335,
      "learning_rate": 7.918805109862172e-06,
      "loss": 0.7701,
      "step": 1258
    },
    {
      "epoch": 0.4763075759008796,
      "grad_norm": 0.9716066799511451,
      "learning_rate": 7.918451370951486e-06,
      "loss": 0.7624,
      "step": 1259
    },
    {
      "epoch": 0.4766858980421829,
      "grad_norm": 1.0417278811736634,
      "learning_rate": 7.91809687108931e-06,
      "loss": 0.8515,
      "step": 1260
    },
    {
      "epoch": 0.47706422018348627,
      "grad_norm": 1.0815755118948713,
      "learning_rate": 7.917741610344492e-06,
      "loss": 0.826,
      "step": 1261
    },
    {
      "epoch": 0.47744254232478955,
      "grad_norm": 0.994132013241377,
      "learning_rate": 7.917385588786019e-06,
      "loss": 0.8112,
      "step": 1262
    },
    {
      "epoch": 0.4778208644660929,
      "grad_norm": 1.0835320028786077,
      "learning_rate": 7.91702880648303e-06,
      "loss": 0.8283,
      "step": 1263
    },
    {
      "epoch": 0.4781991866073962,
      "grad_norm": 1.0656905256693705,
      "learning_rate": 7.916671263504812e-06,
      "loss": 0.8112,
      "step": 1264
    },
    {
      "epoch": 0.47857750874869953,
      "grad_norm": 1.0642356494274112,
      "learning_rate": 7.916312959920796e-06,
      "loss": 0.8187,
      "step": 1265
    },
    {
      "epoch": 0.4789558308900028,
      "grad_norm": 1.1132626507153238,
      "learning_rate": 7.915953895800568e-06,
      "loss": 0.8333,
      "step": 1266
    },
    {
      "epoch": 0.47933415303130616,
      "grad_norm": 1.0964935829984281,
      "learning_rate": 7.915594071213852e-06,
      "loss": 0.8555,
      "step": 1267
    },
    {
      "epoch": 0.47971247517260945,
      "grad_norm": 1.0333616049038883,
      "learning_rate": 7.915233486230529e-06,
      "loss": 0.8002,
      "step": 1268
    },
    {
      "epoch": 0.4800907973139128,
      "grad_norm": 1.0938509373019147,
      "learning_rate": 7.914872140920622e-06,
      "loss": 0.8222,
      "step": 1269
    },
    {
      "epoch": 0.48046911945521614,
      "grad_norm": 1.0500659271586612,
      "learning_rate": 7.914510035354302e-06,
      "loss": 0.7984,
      "step": 1270
    },
    {
      "epoch": 0.48084744159651943,
      "grad_norm": 1.0412102283401292,
      "learning_rate": 7.914147169601891e-06,
      "loss": 0.8178,
      "step": 1271
    },
    {
      "epoch": 0.4812257637378228,
      "grad_norm": 0.9740307673809164,
      "learning_rate": 7.913783543733856e-06,
      "loss": 0.7733,
      "step": 1272
    },
    {
      "epoch": 0.48160408587912606,
      "grad_norm": 1.069013806380367,
      "learning_rate": 7.91341915782081e-06,
      "loss": 0.8355,
      "step": 1273
    },
    {
      "epoch": 0.4819824080204294,
      "grad_norm": 1.020794082270209,
      "learning_rate": 7.913054011933518e-06,
      "loss": 0.8066,
      "step": 1274
    },
    {
      "epoch": 0.4823607301617327,
      "grad_norm": 1.0710477291242142,
      "learning_rate": 7.91268810614289e-06,
      "loss": 0.822,
      "step": 1275
    },
    {
      "epoch": 0.48273905230303604,
      "grad_norm": 1.021706668635038,
      "learning_rate": 7.912321440519982e-06,
      "loss": 0.8393,
      "step": 1276
    },
    {
      "epoch": 0.48311737444433933,
      "grad_norm": 1.0381317605620335,
      "learning_rate": 7.911954015136e-06,
      "loss": 0.8001,
      "step": 1277
    },
    {
      "epoch": 0.4834956965856427,
      "grad_norm": 1.0491889355455017,
      "learning_rate": 7.9115858300623e-06,
      "loss": 0.8424,
      "step": 1278
    },
    {
      "epoch": 0.483874018726946,
      "grad_norm": 1.027527176211447,
      "learning_rate": 7.911216885370377e-06,
      "loss": 0.7934,
      "step": 1279
    },
    {
      "epoch": 0.4842523408682493,
      "grad_norm": 1.0241159829134092,
      "learning_rate": 7.910847181131883e-06,
      "loss": 0.8632,
      "step": 1280
    },
    {
      "epoch": 0.48463066300955265,
      "grad_norm": 1.050840821158761,
      "learning_rate": 7.910476717418613e-06,
      "loss": 0.8341,
      "step": 1281
    },
    {
      "epoch": 0.48500898515085594,
      "grad_norm": 1.0312020050809032,
      "learning_rate": 7.910105494302508e-06,
      "loss": 0.8124,
      "step": 1282
    },
    {
      "epoch": 0.4853873072921593,
      "grad_norm": 1.058895959078315,
      "learning_rate": 7.90973351185566e-06,
      "loss": 0.8179,
      "step": 1283
    },
    {
      "epoch": 0.4857656294334626,
      "grad_norm": 1.0442278097312725,
      "learning_rate": 7.909360770150308e-06,
      "loss": 0.8251,
      "step": 1284
    },
    {
      "epoch": 0.4861439515747659,
      "grad_norm": 1.0685857966408454,
      "learning_rate": 7.908987269258834e-06,
      "loss": 0.8506,
      "step": 1285
    },
    {
      "epoch": 0.4865222737160692,
      "grad_norm": 1.1080322429830538,
      "learning_rate": 7.908613009253774e-06,
      "loss": 0.825,
      "step": 1286
    },
    {
      "epoch": 0.48690059585737255,
      "grad_norm": 1.0340810208381146,
      "learning_rate": 7.908237990207805e-06,
      "loss": 0.7916,
      "step": 1287
    },
    {
      "epoch": 0.4872789179986759,
      "grad_norm": 1.0420175323828418,
      "learning_rate": 7.907862212193758e-06,
      "loss": 0.822,
      "step": 1288
    },
    {
      "epoch": 0.4876572401399792,
      "grad_norm": 1.0199603577395158,
      "learning_rate": 7.907485675284604e-06,
      "loss": 0.8082,
      "step": 1289
    },
    {
      "epoch": 0.48803556228128253,
      "grad_norm": 1.0282638290755661,
      "learning_rate": 7.907108379553467e-06,
      "loss": 0.8308,
      "step": 1290
    },
    {
      "epoch": 0.4884138844225858,
      "grad_norm": 1.0699234725043125,
      "learning_rate": 7.90673032507362e-06,
      "loss": 0.809,
      "step": 1291
    },
    {
      "epoch": 0.48879220656388916,
      "grad_norm": 1.0537759557907738,
      "learning_rate": 7.906351511918477e-06,
      "loss": 0.8244,
      "step": 1292
    },
    {
      "epoch": 0.48917052870519245,
      "grad_norm": 1.0220073412783424,
      "learning_rate": 7.905971940161603e-06,
      "loss": 0.8313,
      "step": 1293
    },
    {
      "epoch": 0.4895488508464958,
      "grad_norm": 1.0751723455689177,
      "learning_rate": 7.905591609876708e-06,
      "loss": 0.8373,
      "step": 1294
    },
    {
      "epoch": 0.4899271729877991,
      "grad_norm": 1.0162597179792359,
      "learning_rate": 7.905210521137654e-06,
      "loss": 0.8142,
      "step": 1295
    },
    {
      "epoch": 0.49030549512910243,
      "grad_norm": 1.0733965520897772,
      "learning_rate": 7.904828674018446e-06,
      "loss": 0.8325,
      "step": 1296
    },
    {
      "epoch": 0.4906838172704058,
      "grad_norm": 1.0275444217813758,
      "learning_rate": 7.904446068593236e-06,
      "loss": 0.812,
      "step": 1297
    },
    {
      "epoch": 0.49106213941170906,
      "grad_norm": 1.0074767810899912,
      "learning_rate": 7.904062704936325e-06,
      "loss": 0.8072,
      "step": 1298
    },
    {
      "epoch": 0.4914404615530124,
      "grad_norm": 1.0390065488319102,
      "learning_rate": 7.903678583122165e-06,
      "loss": 0.8008,
      "step": 1299
    },
    {
      "epoch": 0.4918187836943157,
      "grad_norm": 0.9868065507715447,
      "learning_rate": 7.903293703225345e-06,
      "loss": 0.816,
      "step": 1300
    },
    {
      "epoch": 0.49219710583561904,
      "grad_norm": 1.0553901493428994,
      "learning_rate": 7.902908065320615e-06,
      "loss": 0.835,
      "step": 1301
    },
    {
      "epoch": 0.49257542797692233,
      "grad_norm": 1.0153758567731757,
      "learning_rate": 7.902521669482858e-06,
      "loss": 0.7622,
      "step": 1302
    },
    {
      "epoch": 0.4929537501182257,
      "grad_norm": 1.039524643535567,
      "learning_rate": 7.902134515787115e-06,
      "loss": 0.8219,
      "step": 1303
    },
    {
      "epoch": 0.49333207225952896,
      "grad_norm": 1.0193352620631986,
      "learning_rate": 7.901746604308567e-06,
      "loss": 0.7745,
      "step": 1304
    },
    {
      "epoch": 0.4937103944008323,
      "grad_norm": 1.0237247993056149,
      "learning_rate": 7.901357935122549e-06,
      "loss": 0.7918,
      "step": 1305
    },
    {
      "epoch": 0.49408871654213565,
      "grad_norm": 1.018379832975063,
      "learning_rate": 7.900968508304535e-06,
      "loss": 0.8111,
      "step": 1306
    },
    {
      "epoch": 0.49446703868343894,
      "grad_norm": 1.116472085720671,
      "learning_rate": 7.900578323930154e-06,
      "loss": 0.7942,
      "step": 1307
    },
    {
      "epoch": 0.4948453608247423,
      "grad_norm": 1.0587349903275387,
      "learning_rate": 7.900187382075179e-06,
      "loss": 0.7992,
      "step": 1308
    },
    {
      "epoch": 0.4952236829660456,
      "grad_norm": 1.0058048161089288,
      "learning_rate": 7.899795682815525e-06,
      "loss": 0.7812,
      "step": 1309
    },
    {
      "epoch": 0.4956020051073489,
      "grad_norm": 1.0466221891639538,
      "learning_rate": 7.899403226227265e-06,
      "loss": 0.8172,
      "step": 1310
    },
    {
      "epoch": 0.4959803272486522,
      "grad_norm": 1.021072365800396,
      "learning_rate": 7.899010012386609e-06,
      "loss": 0.7917,
      "step": 1311
    },
    {
      "epoch": 0.49635864938995555,
      "grad_norm": 1.0276680529834,
      "learning_rate": 7.898616041369919e-06,
      "loss": 0.806,
      "step": 1312
    },
    {
      "epoch": 0.49673697153125884,
      "grad_norm": 1.0080935461504426,
      "learning_rate": 7.898221313253703e-06,
      "loss": 0.7839,
      "step": 1313
    },
    {
      "epoch": 0.4971152936725622,
      "grad_norm": 1.045973831410194,
      "learning_rate": 7.897825828114615e-06,
      "loss": 0.8396,
      "step": 1314
    },
    {
      "epoch": 0.49749361581386553,
      "grad_norm": 1.0314643332651545,
      "learning_rate": 7.897429586029458e-06,
      "loss": 0.845,
      "step": 1315
    },
    {
      "epoch": 0.4978719379551688,
      "grad_norm": 1.0214806015923183,
      "learning_rate": 7.897032587075181e-06,
      "loss": 0.8178,
      "step": 1316
    },
    {
      "epoch": 0.49825026009647216,
      "grad_norm": 1.0739578792818636,
      "learning_rate": 7.896634831328881e-06,
      "loss": 0.803,
      "step": 1317
    },
    {
      "epoch": 0.49862858223777545,
      "grad_norm": 1.1075886688146952,
      "learning_rate": 7.8962363188678e-06,
      "loss": 0.7869,
      "step": 1318
    },
    {
      "epoch": 0.4990069043790788,
      "grad_norm": 1.0212558702854573,
      "learning_rate": 7.895837049769326e-06,
      "loss": 0.8181,
      "step": 1319
    },
    {
      "epoch": 0.4993852265203821,
      "grad_norm": 1.0781905029615857,
      "learning_rate": 7.895437024111e-06,
      "loss": 0.8469,
      "step": 1320
    },
    {
      "epoch": 0.49976354866168543,
      "grad_norm": 1.0970231389243905,
      "learning_rate": 7.895036241970501e-06,
      "loss": 0.8268,
      "step": 1321
    },
    {
      "epoch": 0.5001418708029888,
      "grad_norm": 0.9979190002347814,
      "learning_rate": 7.894634703425664e-06,
      "loss": 0.82,
      "step": 1322
    },
    {
      "epoch": 0.5005201929442921,
      "grad_norm": 1.011211832148979,
      "learning_rate": 7.894232408554466e-06,
      "loss": 0.7793,
      "step": 1323
    },
    {
      "epoch": 0.5008985150855954,
      "grad_norm": 1.058479892971991,
      "learning_rate": 7.893829357435027e-06,
      "loss": 0.8557,
      "step": 1324
    },
    {
      "epoch": 0.5012768372268988,
      "grad_norm": 1.067675718676119,
      "learning_rate": 7.893425550145624e-06,
      "loss": 0.8075,
      "step": 1325
    },
    {
      "epoch": 0.501655159368202,
      "grad_norm": 1.0748158502027498,
      "learning_rate": 7.893020986764671e-06,
      "loss": 0.8217,
      "step": 1326
    },
    {
      "epoch": 0.5020334815095053,
      "grad_norm": 1.0371866926324267,
      "learning_rate": 7.892615667370736e-06,
      "loss": 0.786,
      "step": 1327
    },
    {
      "epoch": 0.5024118036508086,
      "grad_norm": 1.0227845872267822,
      "learning_rate": 7.892209592042528e-06,
      "loss": 0.851,
      "step": 1328
    },
    {
      "epoch": 0.502790125792112,
      "grad_norm": 1.053385595871815,
      "learning_rate": 7.891802760858909e-06,
      "loss": 0.8131,
      "step": 1329
    },
    {
      "epoch": 0.5031684479334153,
      "grad_norm": 1.0858668827753901,
      "learning_rate": 7.89139517389888e-06,
      "loss": 0.8178,
      "step": 1330
    },
    {
      "epoch": 0.5031684479334153,
      "eval_loss": 0.8155249357223511,
      "eval_runtime": 26.9154,
      "eval_samples_per_second": 32.881,
      "eval_steps_per_second": 1.04,
      "step": 1330
    },
    {
      "epoch": 0.5031684479334153,
      "eval_bench_accuracy_arc_challenge": 0.22857142857142856,
      "eval_bench_accuracy_hellaswag": 0.255,
      "eval_bench_accuracy_mmlu": 0.2782608695652174,
      "eval_bench_average_accuracy": 0.253944099378882,
      "eval_bench_loss": 5.252888461999726,
      "eval_bench_total_accuracy": 0.25274725274725274,
      "step": 1330
    },
    {
      "epoch": 0.5035467700747186,
      "grad_norm": 1.0418553186067219,
      "learning_rate": 7.890986831241598e-06,
      "loss": 0.7842,
      "step": 1331
    },
    {
      "epoch": 0.503925092216022,
      "grad_norm": 1.027783298562076,
      "learning_rate": 7.890577732966358e-06,
      "loss": 0.7925,
      "step": 1332
    },
    {
      "epoch": 0.5043034143573253,
      "grad_norm": 1.0399175596382164,
      "learning_rate": 7.890167879152609e-06,
      "loss": 0.8595,
      "step": 1333
    },
    {
      "epoch": 0.5046817364986286,
      "grad_norm": 1.0324556300456535,
      "learning_rate": 7.88975726987994e-06,
      "loss": 0.8402,
      "step": 1334
    },
    {
      "epoch": 0.5050600586399319,
      "grad_norm": 1.0669911175427689,
      "learning_rate": 7.889345905228092e-06,
      "loss": 0.8132,
      "step": 1335
    },
    {
      "epoch": 0.5054383807812353,
      "grad_norm": 1.07761249948945,
      "learning_rate": 7.888933785276951e-06,
      "loss": 0.8122,
      "step": 1336
    },
    {
      "epoch": 0.5058167029225386,
      "grad_norm": 1.0315582279231172,
      "learning_rate": 7.888520910106548e-06,
      "loss": 0.8063,
      "step": 1337
    },
    {
      "epoch": 0.5061950250638418,
      "grad_norm": 1.028383480686869,
      "learning_rate": 7.888107279797064e-06,
      "loss": 0.8115,
      "step": 1338
    },
    {
      "epoch": 0.5065733472051451,
      "grad_norm": 1.1084019164549017,
      "learning_rate": 7.887692894428822e-06,
      "loss": 0.8586,
      "step": 1339
    },
    {
      "epoch": 0.5069516693464485,
      "grad_norm": 1.0246273881178,
      "learning_rate": 7.887277754082298e-06,
      "loss": 0.7968,
      "step": 1340
    },
    {
      "epoch": 0.5073299914877518,
      "grad_norm": 1.0537510788483588,
      "learning_rate": 7.886861858838109e-06,
      "loss": 0.7794,
      "step": 1341
    },
    {
      "epoch": 0.5077083136290551,
      "grad_norm": 1.025698434441957,
      "learning_rate": 7.88644520877702e-06,
      "loss": 0.7983,
      "step": 1342
    },
    {
      "epoch": 0.5080866357703585,
      "grad_norm": 1.0480085776508747,
      "learning_rate": 7.886027803979946e-06,
      "loss": 0.8016,
      "step": 1343
    },
    {
      "epoch": 0.5084649579116618,
      "grad_norm": 1.0461816558010573,
      "learning_rate": 7.885609644527943e-06,
      "loss": 0.8189,
      "step": 1344
    },
    {
      "epoch": 0.5088432800529651,
      "grad_norm": 0.993326821555258,
      "learning_rate": 7.885190730502215e-06,
      "loss": 0.7957,
      "step": 1345
    },
    {
      "epoch": 0.5092216021942684,
      "grad_norm": 1.0745480385635238,
      "learning_rate": 7.884771061984118e-06,
      "loss": 0.8019,
      "step": 1346
    },
    {
      "epoch": 0.5095999243355718,
      "grad_norm": 1.0384805298302937,
      "learning_rate": 7.884350639055147e-06,
      "loss": 0.8395,
      "step": 1347
    },
    {
      "epoch": 0.5099782464768751,
      "grad_norm": 1.020760024227472,
      "learning_rate": 7.883929461796949e-06,
      "loss": 0.7919,
      "step": 1348
    },
    {
      "epoch": 0.5103565686181784,
      "grad_norm": 1.0426222802625165,
      "learning_rate": 7.883507530291315e-06,
      "loss": 0.8133,
      "step": 1349
    },
    {
      "epoch": 0.5107348907594818,
      "grad_norm": 1.0236106718012763,
      "learning_rate": 7.883084844620181e-06,
      "loss": 0.7525,
      "step": 1350
    },
    {
      "epoch": 0.511113212900785,
      "grad_norm": 1.0752909757757687,
      "learning_rate": 7.882661404865635e-06,
      "loss": 0.8363,
      "step": 1351
    },
    {
      "epoch": 0.5114915350420883,
      "grad_norm": 1.0496011841679878,
      "learning_rate": 7.882237211109903e-06,
      "loss": 0.825,
      "step": 1352
    },
    {
      "epoch": 0.5118698571833916,
      "grad_norm": 1.052905405929199,
      "learning_rate": 7.881812263435365e-06,
      "loss": 0.7808,
      "step": 1353
    },
    {
      "epoch": 0.512248179324695,
      "grad_norm": 1.0383149467870931,
      "learning_rate": 7.881386561924544e-06,
      "loss": 0.8258,
      "step": 1354
    },
    {
      "epoch": 0.5126265014659983,
      "grad_norm": 1.0142846574710827,
      "learning_rate": 7.880960106660112e-06,
      "loss": 0.832,
      "step": 1355
    },
    {
      "epoch": 0.5130048236073016,
      "grad_norm": 1.0162105056610324,
      "learning_rate": 7.880532897724882e-06,
      "loss": 0.8271,
      "step": 1356
    },
    {
      "epoch": 0.5133831457486049,
      "grad_norm": 1.0111397828819904,
      "learning_rate": 7.880104935201817e-06,
      "loss": 0.7716,
      "step": 1357
    },
    {
      "epoch": 0.5137614678899083,
      "grad_norm": 1.0387312593547113,
      "learning_rate": 7.879676219174028e-06,
      "loss": 0.7856,
      "step": 1358
    },
    {
      "epoch": 0.5141397900312116,
      "grad_norm": 1.0976300200992746,
      "learning_rate": 7.879246749724769e-06,
      "loss": 0.8214,
      "step": 1359
    },
    {
      "epoch": 0.5145181121725149,
      "grad_norm": 1.0225148649560976,
      "learning_rate": 7.878816526937443e-06,
      "loss": 0.8154,
      "step": 1360
    },
    {
      "epoch": 0.5148964343138183,
      "grad_norm": 1.0564511900500775,
      "learning_rate": 7.878385550895597e-06,
      "loss": 0.7706,
      "step": 1361
    },
    {
      "epoch": 0.5152747564551216,
      "grad_norm": 1.065194818654382,
      "learning_rate": 7.877953821682924e-06,
      "loss": 0.7806,
      "step": 1362
    },
    {
      "epoch": 0.5156530785964248,
      "grad_norm": 1.0318627975030588,
      "learning_rate": 7.877521339383267e-06,
      "loss": 0.8317,
      "step": 1363
    },
    {
      "epoch": 0.5160314007377281,
      "grad_norm": 1.0660496042471788,
      "learning_rate": 7.877088104080612e-06,
      "loss": 0.8116,
      "step": 1364
    },
    {
      "epoch": 0.5164097228790315,
      "grad_norm": 1.0084811396262128,
      "learning_rate": 7.87665411585909e-06,
      "loss": 0.8233,
      "step": 1365
    },
    {
      "epoch": 0.5167880450203348,
      "grad_norm": 1.0061856631615549,
      "learning_rate": 7.876219374802983e-06,
      "loss": 0.8226,
      "step": 1366
    },
    {
      "epoch": 0.5171663671616381,
      "grad_norm": 0.9962092519447693,
      "learning_rate": 7.875783880996717e-06,
      "loss": 0.7949,
      "step": 1367
    },
    {
      "epoch": 0.5175446893029415,
      "grad_norm": 1.0320181154699064,
      "learning_rate": 7.87534763452486e-06,
      "loss": 0.8078,
      "step": 1368
    },
    {
      "epoch": 0.5179230114442448,
      "grad_norm": 1.0366220904643662,
      "learning_rate": 7.87491063547213e-06,
      "loss": 0.7915,
      "step": 1369
    },
    {
      "epoch": 0.5183013335855481,
      "grad_norm": 0.9990483570523689,
      "learning_rate": 7.874472883923396e-06,
      "loss": 0.7962,
      "step": 1370
    },
    {
      "epoch": 0.5186796557268514,
      "grad_norm": 1.072712099895109,
      "learning_rate": 7.874034379963663e-06,
      "loss": 0.8201,
      "step": 1371
    },
    {
      "epoch": 0.5190579778681548,
      "grad_norm": 1.0469398611990606,
      "learning_rate": 7.873595123678088e-06,
      "loss": 0.8295,
      "step": 1372
    },
    {
      "epoch": 0.5194363000094581,
      "grad_norm": 1.0258466230718022,
      "learning_rate": 7.873155115151976e-06,
      "loss": 0.7962,
      "step": 1373
    },
    {
      "epoch": 0.5198146221507614,
      "grad_norm": 1.0150744464405486,
      "learning_rate": 7.872714354470771e-06,
      "loss": 0.8091,
      "step": 1374
    },
    {
      "epoch": 0.5201929442920646,
      "grad_norm": 1.0877815460579687,
      "learning_rate": 7.87227284172007e-06,
      "loss": 0.8449,
      "step": 1375
    },
    {
      "epoch": 0.520571266433368,
      "grad_norm": 0.9989012315656198,
      "learning_rate": 7.871830576985613e-06,
      "loss": 0.7904,
      "step": 1376
    },
    {
      "epoch": 0.5209495885746713,
      "grad_norm": 1.0281663493359343,
      "learning_rate": 7.871387560353288e-06,
      "loss": 0.8235,
      "step": 1377
    },
    {
      "epoch": 0.5213279107159746,
      "grad_norm": 1.013255314723829,
      "learning_rate": 7.870943791909124e-06,
      "loss": 0.8137,
      "step": 1378
    },
    {
      "epoch": 0.521706232857278,
      "grad_norm": 1.0404202767535178,
      "learning_rate": 7.870499271739304e-06,
      "loss": 0.8331,
      "step": 1379
    },
    {
      "epoch": 0.5220845549985813,
      "grad_norm": 1.0008843854289766,
      "learning_rate": 7.870053999930149e-06,
      "loss": 0.7985,
      "step": 1380
    },
    {
      "epoch": 0.5224628771398846,
      "grad_norm": 1.115907702208107,
      "learning_rate": 7.869607976568131e-06,
      "loss": 0.8444,
      "step": 1381
    },
    {
      "epoch": 0.5228411992811879,
      "grad_norm": 1.0499698053880258,
      "learning_rate": 7.869161201739866e-06,
      "loss": 0.7875,
      "step": 1382
    },
    {
      "epoch": 0.5232195214224913,
      "grad_norm": 1.0086891227734494,
      "learning_rate": 7.868713675532115e-06,
      "loss": 0.7981,
      "step": 1383
    },
    {
      "epoch": 0.5235978435637946,
      "grad_norm": 1.0416968121742411,
      "learning_rate": 7.868265398031788e-06,
      "loss": 0.8082,
      "step": 1384
    },
    {
      "epoch": 0.5239761657050979,
      "grad_norm": 0.9956171233693443,
      "learning_rate": 7.86781636932594e-06,
      "loss": 0.8497,
      "step": 1385
    },
    {
      "epoch": 0.5243544878464013,
      "grad_norm": 1.0366372693126888,
      "learning_rate": 7.867366589501767e-06,
      "loss": 0.7878,
      "step": 1386
    },
    {
      "epoch": 0.5247328099877046,
      "grad_norm": 1.0252929211171813,
      "learning_rate": 7.86691605864662e-06,
      "loss": 0.8254,
      "step": 1387
    },
    {
      "epoch": 0.5251111321290078,
      "grad_norm": 1.0349722097719734,
      "learning_rate": 7.866464776847987e-06,
      "loss": 0.8092,
      "step": 1388
    },
    {
      "epoch": 0.5254894542703111,
      "grad_norm": 1.0775801625166288,
      "learning_rate": 7.866012744193508e-06,
      "loss": 0.8032,
      "step": 1389
    },
    {
      "epoch": 0.5258677764116145,
      "grad_norm": 1.025158242287074,
      "learning_rate": 7.865559960770964e-06,
      "loss": 0.7777,
      "step": 1390
    },
    {
      "epoch": 0.5262460985529178,
      "grad_norm": 1.0261907345479138,
      "learning_rate": 7.865106426668287e-06,
      "loss": 0.7656,
      "step": 1391
    },
    {
      "epoch": 0.5266244206942211,
      "grad_norm": 1.0119949142526334,
      "learning_rate": 7.864652141973549e-06,
      "loss": 0.817,
      "step": 1392
    },
    {
      "epoch": 0.5270027428355244,
      "grad_norm": 0.9887922738590984,
      "learning_rate": 7.864197106774973e-06,
      "loss": 0.7871,
      "step": 1393
    },
    {
      "epoch": 0.5273810649768278,
      "grad_norm": 1.0473369889166892,
      "learning_rate": 7.863741321160924e-06,
      "loss": 0.7885,
      "step": 1394
    },
    {
      "epoch": 0.5277593871181311,
      "grad_norm": 1.021975230127612,
      "learning_rate": 7.863284785219916e-06,
      "loss": 0.7862,
      "step": 1395
    },
    {
      "epoch": 0.5281377092594344,
      "grad_norm": 1.0624890686836679,
      "learning_rate": 7.862827499040604e-06,
      "loss": 0.8445,
      "step": 1396
    },
    {
      "epoch": 0.5285160314007378,
      "grad_norm": 1.0159701351719927,
      "learning_rate": 7.862369462711795e-06,
      "loss": 0.8084,
      "step": 1397
    },
    {
      "epoch": 0.5288943535420411,
      "grad_norm": 1.0307854419947649,
      "learning_rate": 7.861910676322434e-06,
      "loss": 0.7957,
      "step": 1398
    },
    {
      "epoch": 0.5292726756833444,
      "grad_norm": 1.088274510577477,
      "learning_rate": 7.861451139961622e-06,
      "loss": 0.8134,
      "step": 1399
    },
    {
      "epoch": 0.5296509978246476,
      "grad_norm": 1.1610468987478788,
      "learning_rate": 7.860990853718593e-06,
      "loss": 0.7706,
      "step": 1400
    },
    {
      "epoch": 0.530029319965951,
      "grad_norm": 1.0709949089292212,
      "learning_rate": 7.860529817682737e-06,
      "loss": 0.839,
      "step": 1401
    },
    {
      "epoch": 0.5304076421072543,
      "grad_norm": 1.0641189768424455,
      "learning_rate": 7.860068031943586e-06,
      "loss": 0.7794,
      "step": 1402
    },
    {
      "epoch": 0.5307859642485576,
      "grad_norm": 1.0425801957230985,
      "learning_rate": 7.859605496590816e-06,
      "loss": 0.7982,
      "step": 1403
    },
    {
      "epoch": 0.531164286389861,
      "grad_norm": 1.0561738214600724,
      "learning_rate": 7.859142211714251e-06,
      "loss": 0.8298,
      "step": 1404
    },
    {
      "epoch": 0.5315426085311643,
      "grad_norm": 1.0034598628819673,
      "learning_rate": 7.858678177403859e-06,
      "loss": 0.842,
      "step": 1405
    },
    {
      "epoch": 0.5319209306724676,
      "grad_norm": 1.0174154185360578,
      "learning_rate": 7.858213393749755e-06,
      "loss": 0.8024,
      "step": 1406
    },
    {
      "epoch": 0.5322992528137709,
      "grad_norm": 1.002603647328177,
      "learning_rate": 7.857747860842196e-06,
      "loss": 0.8186,
      "step": 1407
    },
    {
      "epoch": 0.5326775749550743,
      "grad_norm": 1.0285530234043798,
      "learning_rate": 7.857281578771589e-06,
      "loss": 0.8156,
      "step": 1408
    },
    {
      "epoch": 0.5330558970963776,
      "grad_norm": 1.02768116084931,
      "learning_rate": 7.856814547628485e-06,
      "loss": 0.8165,
      "step": 1409
    },
    {
      "epoch": 0.5334342192376809,
      "grad_norm": 1.1031829681313992,
      "learning_rate": 7.85634676750358e-06,
      "loss": 0.8579,
      "step": 1410
    },
    {
      "epoch": 0.5338125413789842,
      "grad_norm": 1.027426941839886,
      "learning_rate": 7.855878238487714e-06,
      "loss": 0.7945,
      "step": 1411
    },
    {
      "epoch": 0.5341908635202876,
      "grad_norm": 1.0561714395136612,
      "learning_rate": 7.855408960671875e-06,
      "loss": 0.7641,
      "step": 1412
    },
    {
      "epoch": 0.5345691856615908,
      "grad_norm": 1.090238437190781,
      "learning_rate": 7.854938934147195e-06,
      "loss": 0.8063,
      "step": 1413
    },
    {
      "epoch": 0.5349475078028941,
      "grad_norm": 1.2074317498906901,
      "learning_rate": 7.854468159004952e-06,
      "loss": 0.7921,
      "step": 1414
    },
    {
      "epoch": 0.5353258299441975,
      "grad_norm": 1.0749934432108652,
      "learning_rate": 7.85399663533657e-06,
      "loss": 0.8165,
      "step": 1415
    },
    {
      "epoch": 0.5357041520855008,
      "grad_norm": 1.0472554586470812,
      "learning_rate": 7.853524363233614e-06,
      "loss": 0.8232,
      "step": 1416
    },
    {
      "epoch": 0.5360824742268041,
      "grad_norm": 1.0321608082815132,
      "learning_rate": 7.853051342787802e-06,
      "loss": 0.8207,
      "step": 1417
    },
    {
      "epoch": 0.5364607963681074,
      "grad_norm": 1.010186032847584,
      "learning_rate": 7.852577574090992e-06,
      "loss": 0.7875,
      "step": 1418
    },
    {
      "epoch": 0.5368391185094108,
      "grad_norm": 1.0585550633979846,
      "learning_rate": 7.852103057235187e-06,
      "loss": 0.7872,
      "step": 1419
    },
    {
      "epoch": 0.5372174406507141,
      "grad_norm": 1.0424950696245099,
      "learning_rate": 7.851627792312539e-06,
      "loss": 0.7871,
      "step": 1420
    },
    {
      "epoch": 0.5375957627920174,
      "grad_norm": 1.0123853847303819,
      "learning_rate": 7.85115177941534e-06,
      "loss": 0.7915,
      "step": 1421
    },
    {
      "epoch": 0.5379740849333208,
      "grad_norm": 1.0357173714573609,
      "learning_rate": 7.850675018636034e-06,
      "loss": 0.7829,
      "step": 1422
    },
    {
      "epoch": 0.5383524070746241,
      "grad_norm": 1.4395615442604752,
      "learning_rate": 7.850197510067203e-06,
      "loss": 0.8255,
      "step": 1423
    },
    {
      "epoch": 0.5387307292159274,
      "grad_norm": 1.0121918462650672,
      "learning_rate": 7.849719253801578e-06,
      "loss": 0.8553,
      "step": 1424
    },
    {
      "epoch": 0.5391090513572306,
      "grad_norm": 0.9837030660961567,
      "learning_rate": 7.849240249932039e-06,
      "loss": 0.7586,
      "step": 1425
    },
    {
      "epoch": 0.539487373498534,
      "grad_norm": 1.018520798880126,
      "learning_rate": 7.848760498551603e-06,
      "loss": 0.8266,
      "step": 1426
    },
    {
      "epoch": 0.5398656956398373,
      "grad_norm": 1.0215594842474691,
      "learning_rate": 7.848279999753438e-06,
      "loss": 0.8115,
      "step": 1427
    },
    {
      "epoch": 0.5402440177811406,
      "grad_norm": 1.0166660418304827,
      "learning_rate": 7.847798753630854e-06,
      "loss": 0.7822,
      "step": 1428
    },
    {
      "epoch": 0.5406223399224439,
      "grad_norm": 1.0027140748494623,
      "learning_rate": 7.84731676027731e-06,
      "loss": 0.8033,
      "step": 1429
    },
    {
      "epoch": 0.5410006620637473,
      "grad_norm": 1.0627188785846766,
      "learning_rate": 7.846834019786404e-06,
      "loss": 0.8265,
      "step": 1430
    },
    {
      "epoch": 0.5413789842050506,
      "grad_norm": 1.0264202021796238,
      "learning_rate": 7.846350532251887e-06,
      "loss": 0.8109,
      "step": 1431
    },
    {
      "epoch": 0.5417573063463539,
      "grad_norm": 1.0850130197305035,
      "learning_rate": 7.845866297767647e-06,
      "loss": 0.8166,
      "step": 1432
    },
    {
      "epoch": 0.5421356284876573,
      "grad_norm": 1.0443803197744415,
      "learning_rate": 7.845381316427724e-06,
      "loss": 0.8134,
      "step": 1433
    },
    {
      "epoch": 0.5425139506289606,
      "grad_norm": 1.0216121613789444,
      "learning_rate": 7.844895588326298e-06,
      "loss": 0.8248,
      "step": 1434
    },
    {
      "epoch": 0.5428922727702639,
      "grad_norm": 1.0528680390786613,
      "learning_rate": 7.844409113557698e-06,
      "loss": 0.8306,
      "step": 1435
    },
    {
      "epoch": 0.5432705949115672,
      "grad_norm": 1.056376944389717,
      "learning_rate": 7.843921892216392e-06,
      "loss": 0.7733,
      "step": 1436
    },
    {
      "epoch": 0.5436489170528706,
      "grad_norm": 1.0054617166141346,
      "learning_rate": 7.843433924397002e-06,
      "loss": 0.7937,
      "step": 1437
    },
    {
      "epoch": 0.5440272391941738,
      "grad_norm": 1.0047703505362153,
      "learning_rate": 7.842945210194286e-06,
      "loss": 0.7923,
      "step": 1438
    },
    {
      "epoch": 0.5444055613354771,
      "grad_norm": 1.0096110719940172,
      "learning_rate": 7.842455749703151e-06,
      "loss": 0.7994,
      "step": 1439
    },
    {
      "epoch": 0.5447838834767805,
      "grad_norm": 1.0605981769829262,
      "learning_rate": 7.841965543018651e-06,
      "loss": 0.8085,
      "step": 1440
    },
    {
      "epoch": 0.5451622056180838,
      "grad_norm": 1.0471718815415907,
      "learning_rate": 7.841474590235981e-06,
      "loss": 0.8463,
      "step": 1441
    },
    {
      "epoch": 0.5455405277593871,
      "grad_norm": 1.0505867574083267,
      "learning_rate": 7.840982891450483e-06,
      "loss": 0.8242,
      "step": 1442
    },
    {
      "epoch": 0.5459188499006904,
      "grad_norm": 1.0445952963424892,
      "learning_rate": 7.840490446757645e-06,
      "loss": 0.7749,
      "step": 1443
    },
    {
      "epoch": 0.5462971720419938,
      "grad_norm": 1.0068778649332644,
      "learning_rate": 7.839997256253096e-06,
      "loss": 0.8116,
      "step": 1444
    },
    {
      "epoch": 0.5466754941832971,
      "grad_norm": 1.00961692913919,
      "learning_rate": 7.839503320032612e-06,
      "loss": 0.7901,
      "step": 1445
    },
    {
      "epoch": 0.5470538163246004,
      "grad_norm": 0.9780075250092127,
      "learning_rate": 7.839008638192115e-06,
      "loss": 0.7885,
      "step": 1446
    },
    {
      "epoch": 0.5474321384659037,
      "grad_norm": 1.100812581357096,
      "learning_rate": 7.838513210827671e-06,
      "loss": 0.8001,
      "step": 1447
    },
    {
      "epoch": 0.5478104606072071,
      "grad_norm": 1.0494389505966184,
      "learning_rate": 7.83801703803549e-06,
      "loss": 0.7977,
      "step": 1448
    },
    {
      "epoch": 0.5481887827485104,
      "grad_norm": 1.034386181938751,
      "learning_rate": 7.837520119911927e-06,
      "loss": 0.8244,
      "step": 1449
    },
    {
      "epoch": 0.5485671048898136,
      "grad_norm": 1.0112131883045796,
      "learning_rate": 7.837022456553482e-06,
      "loss": 0.7537,
      "step": 1450
    },
    {
      "epoch": 0.548945427031117,
      "grad_norm": 1.0542214842469684,
      "learning_rate": 7.836524048056801e-06,
      "loss": 0.8436,
      "step": 1451
    },
    {
      "epoch": 0.5493237491724203,
      "grad_norm": 1.0139124551358574,
      "learning_rate": 7.836024894518673e-06,
      "loss": 0.7765,
      "step": 1452
    },
    {
      "epoch": 0.5497020713137236,
      "grad_norm": 1.0370438053735662,
      "learning_rate": 7.835524996036031e-06,
      "loss": 0.7957,
      "step": 1453
    },
    {
      "epoch": 0.5500803934550269,
      "grad_norm": 1.0403261101993466,
      "learning_rate": 7.835024352705953e-06,
      "loss": 0.8082,
      "step": 1454
    },
    {
      "epoch": 0.5504587155963303,
      "grad_norm": 1.0223772000926137,
      "learning_rate": 7.834522964625665e-06,
      "loss": 0.8091,
      "step": 1455
    },
    {
      "epoch": 0.5508370377376336,
      "grad_norm": 0.9867288417868126,
      "learning_rate": 7.834020831892534e-06,
      "loss": 0.7971,
      "step": 1456
    },
    {
      "epoch": 0.5512153598789369,
      "grad_norm": 1.038419907192562,
      "learning_rate": 7.833517954604074e-06,
      "loss": 0.7774,
      "step": 1457
    },
    {
      "epoch": 0.5515936820202403,
      "grad_norm": 1.0143771814537008,
      "learning_rate": 7.833014332857939e-06,
      "loss": 0.7763,
      "step": 1458
    },
    {
      "epoch": 0.5519720041615436,
      "grad_norm": 1.0001756819325087,
      "learning_rate": 7.832509966751933e-06,
      "loss": 0.7889,
      "step": 1459
    },
    {
      "epoch": 0.5523503263028469,
      "grad_norm": 1.036257856076326,
      "learning_rate": 7.832004856384001e-06,
      "loss": 0.7901,
      "step": 1460
    },
    {
      "epoch": 0.5527286484441502,
      "grad_norm": 1.0355156315068814,
      "learning_rate": 7.831499001852236e-06,
      "loss": 0.7742,
      "step": 1461
    },
    {
      "epoch": 0.5531069705854536,
      "grad_norm": 1.1407334044483102,
      "learning_rate": 7.830992403254873e-06,
      "loss": 0.8265,
      "step": 1462
    },
    {
      "epoch": 0.5534852927267568,
      "grad_norm": 1.0063557289156941,
      "learning_rate": 7.83048506069029e-06,
      "loss": 0.7994,
      "step": 1463
    },
    {
      "epoch": 0.5534852927267568,
      "eval_loss": 0.8094308972358704,
      "eval_runtime": 26.9598,
      "eval_samples_per_second": 32.827,
      "eval_steps_per_second": 1.039,
      "step": 1463
    },
    {
      "epoch": 0.5534852927267568,
      "eval_bench_accuracy_arc_challenge": 0.25,
      "eval_bench_accuracy_hellaswag": 0.215,
      "eval_bench_accuracy_mmlu": 0.2608695652173913,
      "eval_bench_average_accuracy": 0.24195652173913043,
      "eval_bench_loss": 6.063661274157073,
      "eval_bench_total_accuracy": 0.23736263736263735,
      "step": 1463
    },
    {
      "epoch": 0.5538636148680601,
      "grad_norm": 1.0744841523132298,
      "learning_rate": 7.829976974257012e-06,
      "loss": 0.8504,
      "step": 1464
    },
    {
      "epoch": 0.5542419370093635,
      "grad_norm": 1.0186917057516884,
      "learning_rate": 7.829468144053712e-06,
      "loss": 0.8052,
      "step": 1465
    },
    {
      "epoch": 0.5546202591506668,
      "grad_norm": 1.0107687681368964,
      "learning_rate": 7.828958570179196e-06,
      "loss": 0.8094,
      "step": 1466
    },
    {
      "epoch": 0.5549985812919701,
      "grad_norm": 1.0349853318053726,
      "learning_rate": 7.828448252732428e-06,
      "loss": 0.8303,
      "step": 1467
    },
    {
      "epoch": 0.5553769034332734,
      "grad_norm": 1.0450694598466956,
      "learning_rate": 7.827937191812508e-06,
      "loss": 0.7924,
      "step": 1468
    },
    {
      "epoch": 0.5557552255745768,
      "grad_norm": 1.0278598268440422,
      "learning_rate": 7.82742538751868e-06,
      "loss": 0.7701,
      "step": 1469
    },
    {
      "epoch": 0.5561335477158801,
      "grad_norm": 1.0315097348678433,
      "learning_rate": 7.826912839950338e-06,
      "loss": 0.7643,
      "step": 1470
    },
    {
      "epoch": 0.5565118698571834,
      "grad_norm": 1.0630245419936848,
      "learning_rate": 7.826399549207016e-06,
      "loss": 0.8334,
      "step": 1471
    },
    {
      "epoch": 0.5568901919984867,
      "grad_norm": 1.057495631028003,
      "learning_rate": 7.825885515388394e-06,
      "loss": 0.8098,
      "step": 1472
    },
    {
      "epoch": 0.5572685141397901,
      "grad_norm": 1.0485936898987425,
      "learning_rate": 7.825370738594296e-06,
      "loss": 0.8524,
      "step": 1473
    },
    {
      "epoch": 0.5576468362810933,
      "grad_norm": 1.089800751911175,
      "learning_rate": 7.82485521892469e-06,
      "loss": 0.7807,
      "step": 1474
    },
    {
      "epoch": 0.5580251584223966,
      "grad_norm": 1.008238694676228,
      "learning_rate": 7.824338956479687e-06,
      "loss": 0.7641,
      "step": 1475
    },
    {
      "epoch": 0.5584034805637,
      "grad_norm": 0.9866356509513795,
      "learning_rate": 7.823821951359546e-06,
      "loss": 0.8072,
      "step": 1476
    },
    {
      "epoch": 0.5587818027050033,
      "grad_norm": 1.0159932518028019,
      "learning_rate": 7.823304203664665e-06,
      "loss": 0.7563,
      "step": 1477
    },
    {
      "epoch": 0.5591601248463066,
      "grad_norm": 1.0691391299613169,
      "learning_rate": 7.82278571349559e-06,
      "loss": 0.7666,
      "step": 1478
    },
    {
      "epoch": 0.5595384469876099,
      "grad_norm": 1.069708560088697,
      "learning_rate": 7.822266480953014e-06,
      "loss": 0.8094,
      "step": 1479
    },
    {
      "epoch": 0.5599167691289133,
      "grad_norm": 1.0399404229309808,
      "learning_rate": 7.821746506137766e-06,
      "loss": 0.8041,
      "step": 1480
    },
    {
      "epoch": 0.5602950912702166,
      "grad_norm": 1.0528966086217326,
      "learning_rate": 7.821225789150823e-06,
      "loss": 0.8186,
      "step": 1481
    },
    {
      "epoch": 0.5606734134115199,
      "grad_norm": 1.078154168587184,
      "learning_rate": 7.820704330093309e-06,
      "loss": 0.7697,
      "step": 1482
    },
    {
      "epoch": 0.5610517355528233,
      "grad_norm": 0.9974199242655317,
      "learning_rate": 7.82018212906649e-06,
      "loss": 0.7627,
      "step": 1483
    },
    {
      "epoch": 0.5614300576941266,
      "grad_norm": 1.0441157570327169,
      "learning_rate": 7.819659186171774e-06,
      "loss": 0.7637,
      "step": 1484
    },
    {
      "epoch": 0.5618083798354299,
      "grad_norm": 1.0350192453023053,
      "learning_rate": 7.819135501510717e-06,
      "loss": 0.7863,
      "step": 1485
    },
    {
      "epoch": 0.5621867019767331,
      "grad_norm": 1.0314197771080482,
      "learning_rate": 7.818611075185016e-06,
      "loss": 0.7761,
      "step": 1486
    },
    {
      "epoch": 0.5625650241180365,
      "grad_norm": 1.1142918188982494,
      "learning_rate": 7.818085907296514e-06,
      "loss": 0.8451,
      "step": 1487
    },
    {
      "epoch": 0.5629433462593398,
      "grad_norm": 1.0635918190610065,
      "learning_rate": 7.817559997947194e-06,
      "loss": 0.7987,
      "step": 1488
    },
    {
      "epoch": 0.5633216684006431,
      "grad_norm": 1.0137296000615337,
      "learning_rate": 7.817033347239188e-06,
      "loss": 0.7849,
      "step": 1489
    },
    {
      "epoch": 0.5636999905419464,
      "grad_norm": 1.0465836630867722,
      "learning_rate": 7.816505955274772e-06,
      "loss": 0.7609,
      "step": 1490
    },
    {
      "epoch": 0.5640783126832498,
      "grad_norm": 1.0227869394316658,
      "learning_rate": 7.81597782215636e-06,
      "loss": 0.7658,
      "step": 1491
    },
    {
      "epoch": 0.5644566348245531,
      "grad_norm": 1.025273340871076,
      "learning_rate": 7.815448947986518e-06,
      "loss": 0.7943,
      "step": 1492
    },
    {
      "epoch": 0.5648349569658564,
      "grad_norm": 1.0788965118297305,
      "learning_rate": 7.814919332867948e-06,
      "loss": 0.7825,
      "step": 1493
    },
    {
      "epoch": 0.5652132791071598,
      "grad_norm": 1.0290788502294095,
      "learning_rate": 7.814388976903501e-06,
      "loss": 0.7686,
      "step": 1494
    },
    {
      "epoch": 0.5655916012484631,
      "grad_norm": 1.0043872677988737,
      "learning_rate": 7.813857880196172e-06,
      "loss": 0.765,
      "step": 1495
    },
    {
      "epoch": 0.5659699233897664,
      "grad_norm": 1.0416556353562665,
      "learning_rate": 7.813326042849096e-06,
      "loss": 0.7905,
      "step": 1496
    },
    {
      "epoch": 0.5663482455310697,
      "grad_norm": 1.0403767458597168,
      "learning_rate": 7.812793464965557e-06,
      "loss": 0.8392,
      "step": 1497
    },
    {
      "epoch": 0.5667265676723731,
      "grad_norm": 1.0804135578705913,
      "learning_rate": 7.812260146648978e-06,
      "loss": 0.8042,
      "step": 1498
    },
    {
      "epoch": 0.5671048898136763,
      "grad_norm": 1.0525290992619953,
      "learning_rate": 7.811726088002928e-06,
      "loss": 0.8125,
      "step": 1499
    },
    {
      "epoch": 0.5674832119549796,
      "grad_norm": 1.0443809449733452,
      "learning_rate": 7.81119128913112e-06,
      "loss": 0.8449,
      "step": 1500
    },
    {
      "epoch": 0.567861534096283,
      "grad_norm": 1.0484442830821317,
      "learning_rate": 7.810655750137408e-06,
      "loss": 0.791,
      "step": 1501
    },
    {
      "epoch": 0.5682398562375863,
      "grad_norm": 1.0322889324418691,
      "learning_rate": 7.810119471125797e-06,
      "loss": 0.7638,
      "step": 1502
    },
    {
      "epoch": 0.5686181783788896,
      "grad_norm": 1.0251619422017846,
      "learning_rate": 7.809582452200428e-06,
      "loss": 0.7971,
      "step": 1503
    },
    {
      "epoch": 0.5689965005201929,
      "grad_norm": 1.0150926516902954,
      "learning_rate": 7.809044693465587e-06,
      "loss": 0.7734,
      "step": 1504
    },
    {
      "epoch": 0.5693748226614963,
      "grad_norm": 1.0663474541629985,
      "learning_rate": 7.808506195025707e-06,
      "loss": 0.8411,
      "step": 1505
    },
    {
      "epoch": 0.5697531448027996,
      "grad_norm": 1.0708265848333849,
      "learning_rate": 7.807966956985363e-06,
      "loss": 0.8428,
      "step": 1506
    },
    {
      "epoch": 0.5701314669441029,
      "grad_norm": 1.0294311898641297,
      "learning_rate": 7.807426979449273e-06,
      "loss": 0.8016,
      "step": 1507
    },
    {
      "epoch": 0.5705097890854062,
      "grad_norm": 1.072155935601359,
      "learning_rate": 7.806886262522298e-06,
      "loss": 0.7896,
      "step": 1508
    },
    {
      "epoch": 0.5708881112267096,
      "grad_norm": 1.0602457428763656,
      "learning_rate": 7.806344806309445e-06,
      "loss": 0.8306,
      "step": 1509
    },
    {
      "epoch": 0.5712664333680129,
      "grad_norm": 1.0410264668234372,
      "learning_rate": 7.805802610915862e-06,
      "loss": 0.7708,
      "step": 1510
    },
    {
      "epoch": 0.5716447555093161,
      "grad_norm": 1.0323609766839155,
      "learning_rate": 7.805259676446843e-06,
      "loss": 0.7731,
      "step": 1511
    },
    {
      "epoch": 0.5720230776506195,
      "grad_norm": 1.0629777585594808,
      "learning_rate": 7.804716003007825e-06,
      "loss": 0.8667,
      "step": 1512
    },
    {
      "epoch": 0.5724013997919228,
      "grad_norm": 0.9991092397744588,
      "learning_rate": 7.804171590704384e-06,
      "loss": 0.8158,
      "step": 1513
    },
    {
      "epoch": 0.5727797219332261,
      "grad_norm": 1.0691406196971251,
      "learning_rate": 7.803626439642245e-06,
      "loss": 0.8439,
      "step": 1514
    },
    {
      "epoch": 0.5731580440745294,
      "grad_norm": 1.003105717691004,
      "learning_rate": 7.803080549927276e-06,
      "loss": 0.8294,
      "step": 1515
    },
    {
      "epoch": 0.5735363662158328,
      "grad_norm": 1.03908547211568,
      "learning_rate": 7.802533921665487e-06,
      "loss": 0.7924,
      "step": 1516
    },
    {
      "epoch": 0.5739146883571361,
      "grad_norm": 1.0879350896154778,
      "learning_rate": 7.801986554963032e-06,
      "loss": 0.8214,
      "step": 1517
    },
    {
      "epoch": 0.5742930104984394,
      "grad_norm": 1.0215923317383557,
      "learning_rate": 7.801438449926204e-06,
      "loss": 0.7672,
      "step": 1518
    },
    {
      "epoch": 0.5746713326397428,
      "grad_norm": 1.0667625852082359,
      "learning_rate": 7.800889606661448e-06,
      "loss": 0.779,
      "step": 1519
    },
    {
      "epoch": 0.5750496547810461,
      "grad_norm": 1.0265205651578218,
      "learning_rate": 7.800340025275346e-06,
      "loss": 0.8048,
      "step": 1520
    },
    {
      "epoch": 0.5754279769223494,
      "grad_norm": 1.07228233508983,
      "learning_rate": 7.799789705874626e-06,
      "loss": 0.7798,
      "step": 1521
    },
    {
      "epoch": 0.5758062990636527,
      "grad_norm": 1.0864037890509946,
      "learning_rate": 7.799238648566155e-06,
      "loss": 0.8061,
      "step": 1522
    },
    {
      "epoch": 0.5761846212049561,
      "grad_norm": 1.024552729289987,
      "learning_rate": 7.79868685345695e-06,
      "loss": 0.7923,
      "step": 1523
    },
    {
      "epoch": 0.5765629433462593,
      "grad_norm": 1.050893206442173,
      "learning_rate": 7.798134320654169e-06,
      "loss": 0.7922,
      "step": 1524
    },
    {
      "epoch": 0.5769412654875626,
      "grad_norm": 1.0361508996059923,
      "learning_rate": 7.797581050265108e-06,
      "loss": 0.7934,
      "step": 1525
    },
    {
      "epoch": 0.5773195876288659,
      "grad_norm": 1.0710969406799804,
      "learning_rate": 7.797027042397215e-06,
      "loss": 0.8126,
      "step": 1526
    },
    {
      "epoch": 0.5776979097701693,
      "grad_norm": 1.0658020905692465,
      "learning_rate": 7.796472297158071e-06,
      "loss": 0.825,
      "step": 1527
    },
    {
      "epoch": 0.5780762319114726,
      "grad_norm": 1.0530236797299208,
      "learning_rate": 7.79591681465541e-06,
      "loss": 0.8297,
      "step": 1528
    },
    {
      "epoch": 0.5784545540527759,
      "grad_norm": 1.0375398854054054,
      "learning_rate": 7.795360594997107e-06,
      "loss": 0.8184,
      "step": 1529
    },
    {
      "epoch": 0.5788328761940793,
      "grad_norm": 1.0223176641231346,
      "learning_rate": 7.794803638291175e-06,
      "loss": 0.8081,
      "step": 1530
    },
    {
      "epoch": 0.5792111983353826,
      "grad_norm": 1.0392507145784662,
      "learning_rate": 7.794245944645772e-06,
      "loss": 0.8473,
      "step": 1531
    },
    {
      "epoch": 0.5795895204766859,
      "grad_norm": 1.022490501012432,
      "learning_rate": 7.793687514169201e-06,
      "loss": 0.7883,
      "step": 1532
    },
    {
      "epoch": 0.5799678426179892,
      "grad_norm": 1.0564202458689138,
      "learning_rate": 7.793128346969911e-06,
      "loss": 0.7797,
      "step": 1533
    },
    {
      "epoch": 0.5803461647592926,
      "grad_norm": 1.0741330485557585,
      "learning_rate": 7.792568443156489e-06,
      "loss": 0.808,
      "step": 1534
    },
    {
      "epoch": 0.5807244869005959,
      "grad_norm": 0.9936986392860392,
      "learning_rate": 7.792007802837665e-06,
      "loss": 0.7748,
      "step": 1535
    },
    {
      "epoch": 0.5811028090418991,
      "grad_norm": 1.04388957808874,
      "learning_rate": 7.791446426122313e-06,
      "loss": 0.8282,
      "step": 1536
    },
    {
      "epoch": 0.5814811311832025,
      "grad_norm": 1.0718346958784504,
      "learning_rate": 7.790884313119454e-06,
      "loss": 0.7922,
      "step": 1537
    },
    {
      "epoch": 0.5818594533245058,
      "grad_norm": 1.0477864953037763,
      "learning_rate": 7.790321463938246e-06,
      "loss": 0.8141,
      "step": 1538
    },
    {
      "epoch": 0.5822377754658091,
      "grad_norm": 1.026774949013717,
      "learning_rate": 7.789757878687995e-06,
      "loss": 0.7598,
      "step": 1539
    },
    {
      "epoch": 0.5826160976071124,
      "grad_norm": 1.015538072369435,
      "learning_rate": 7.789193557478143e-06,
      "loss": 0.7877,
      "step": 1540
    },
    {
      "epoch": 0.5829944197484158,
      "grad_norm": 1.0348274415641654,
      "learning_rate": 7.788628500418287e-06,
      "loss": 0.8258,
      "step": 1541
    },
    {
      "epoch": 0.5833727418897191,
      "grad_norm": 1.02268572106111,
      "learning_rate": 7.788062707618151e-06,
      "loss": 0.8323,
      "step": 1542
    },
    {
      "epoch": 0.5837510640310224,
      "grad_norm": 1.0046192564851208,
      "learning_rate": 7.787496179187618e-06,
      "loss": 0.7522,
      "step": 1543
    },
    {
      "epoch": 0.5841293861723257,
      "grad_norm": 1.0526322563558683,
      "learning_rate": 7.7869289152367e-06,
      "loss": 0.8168,
      "step": 1544
    },
    {
      "epoch": 0.5845077083136291,
      "grad_norm": 0.9819648563646498,
      "learning_rate": 7.78636091587556e-06,
      "loss": 0.7441,
      "step": 1545
    },
    {
      "epoch": 0.5848860304549324,
      "grad_norm": 1.0131957579824842,
      "learning_rate": 7.785792181214504e-06,
      "loss": 0.7716,
      "step": 1546
    },
    {
      "epoch": 0.5852643525962357,
      "grad_norm": 1.0442706083972597,
      "learning_rate": 7.785222711363975e-06,
      "loss": 0.783,
      "step": 1547
    },
    {
      "epoch": 0.5856426747375391,
      "grad_norm": 1.024417321524946,
      "learning_rate": 7.784652506434564e-06,
      "loss": 0.808,
      "step": 1548
    },
    {
      "epoch": 0.5860209968788423,
      "grad_norm": 1.0597851794054838,
      "learning_rate": 7.784081566537004e-06,
      "loss": 0.8209,
      "step": 1549
    },
    {
      "epoch": 0.5863993190201456,
      "grad_norm": 1.0122874466478462,
      "learning_rate": 7.783509891782168e-06,
      "loss": 0.7717,
      "step": 1550
    },
    {
      "epoch": 0.5867776411614489,
      "grad_norm": 1.0075483569470989,
      "learning_rate": 7.782937482281076e-06,
      "loss": 0.7653,
      "step": 1551
    },
    {
      "epoch": 0.5871559633027523,
      "grad_norm": 1.021446573700645,
      "learning_rate": 7.782364338144885e-06,
      "loss": 0.7696,
      "step": 1552
    },
    {
      "epoch": 0.5875342854440556,
      "grad_norm": 1.0432444836660548,
      "learning_rate": 7.781790459484901e-06,
      "loss": 0.7933,
      "step": 1553
    },
    {
      "epoch": 0.5879126075853589,
      "grad_norm": 1.0051174216679133,
      "learning_rate": 7.781215846412565e-06,
      "loss": 0.7867,
      "step": 1554
    },
    {
      "epoch": 0.5882909297266623,
      "grad_norm": 1.0867512164890576,
      "learning_rate": 7.78064049903947e-06,
      "loss": 0.7725,
      "step": 1555
    },
    {
      "epoch": 0.5886692518679656,
      "grad_norm": 1.04980942321374,
      "learning_rate": 7.780064417477346e-06,
      "loss": 0.8114,
      "step": 1556
    },
    {
      "epoch": 0.5890475740092689,
      "grad_norm": 1.0617568995349125,
      "learning_rate": 7.779487601838065e-06,
      "loss": 0.7859,
      "step": 1557
    },
    {
      "epoch": 0.5894258961505722,
      "grad_norm": 1.0628832051157708,
      "learning_rate": 7.778910052233642e-06,
      "loss": 0.8021,
      "step": 1558
    },
    {
      "epoch": 0.5898042182918756,
      "grad_norm": 1.0898131031337233,
      "learning_rate": 7.778331768776237e-06,
      "loss": 0.802,
      "step": 1559
    },
    {
      "epoch": 0.5901825404331789,
      "grad_norm": 1.0649413521341573,
      "learning_rate": 7.77775275157815e-06,
      "loss": 0.8217,
      "step": 1560
    },
    {
      "epoch": 0.5905608625744821,
      "grad_norm": 1.0368511400497493,
      "learning_rate": 7.777173000751825e-06,
      "loss": 0.7819,
      "step": 1561
    },
    {
      "epoch": 0.5909391847157854,
      "grad_norm": 1.020241580639323,
      "learning_rate": 7.776592516409848e-06,
      "loss": 0.8435,
      "step": 1562
    },
    {
      "epoch": 0.5913175068570888,
      "grad_norm": 1.039218236167864,
      "learning_rate": 7.776011298664945e-06,
      "loss": 0.822,
      "step": 1563
    },
    {
      "epoch": 0.5916958289983921,
      "grad_norm": 1.0277738056724017,
      "learning_rate": 7.775429347629992e-06,
      "loss": 0.7755,
      "step": 1564
    },
    {
      "epoch": 0.5920741511396954,
      "grad_norm": 0.9767055405759969,
      "learning_rate": 7.774846663417996e-06,
      "loss": 0.8259,
      "step": 1565
    },
    {
      "epoch": 0.5924524732809988,
      "grad_norm": 1.0409555633420142,
      "learning_rate": 7.774263246142116e-06,
      "loss": 0.7829,
      "step": 1566
    },
    {
      "epoch": 0.5928307954223021,
      "grad_norm": 1.0275312312209073,
      "learning_rate": 7.77367909591565e-06,
      "loss": 0.7724,
      "step": 1567
    },
    {
      "epoch": 0.5932091175636054,
      "grad_norm": 1.0128232786560865,
      "learning_rate": 7.773094212852036e-06,
      "loss": 0.778,
      "step": 1568
    },
    {
      "epoch": 0.5935874397049087,
      "grad_norm": 1.010220293379828,
      "learning_rate": 7.77250859706486e-06,
      "loss": 0.8122,
      "step": 1569
    },
    {
      "epoch": 0.5939657618462121,
      "grad_norm": 1.0377569519031766,
      "learning_rate": 7.771922248667843e-06,
      "loss": 0.7944,
      "step": 1570
    },
    {
      "epoch": 0.5943440839875154,
      "grad_norm": 1.0056143743542545,
      "learning_rate": 7.771335167774855e-06,
      "loss": 0.8184,
      "step": 1571
    },
    {
      "epoch": 0.5947224061288187,
      "grad_norm": 1.0823167997700618,
      "learning_rate": 7.770747354499902e-06,
      "loss": 0.793,
      "step": 1572
    },
    {
      "epoch": 0.5951007282701221,
      "grad_norm": 1.005554310069684,
      "learning_rate": 7.770158808957142e-06,
      "loss": 0.8294,
      "step": 1573
    },
    {
      "epoch": 0.5954790504114253,
      "grad_norm": 1.016774447299906,
      "learning_rate": 7.769569531260861e-06,
      "loss": 0.7916,
      "step": 1574
    },
    {
      "epoch": 0.5958573725527286,
      "grad_norm": 0.9815704963237092,
      "learning_rate": 7.7689795215255e-06,
      "loss": 0.7873,
      "step": 1575
    },
    {
      "epoch": 0.5962356946940319,
      "grad_norm": 1.054358096080715,
      "learning_rate": 7.768388779865636e-06,
      "loss": 0.8164,
      "step": 1576
    },
    {
      "epoch": 0.5966140168353353,
      "grad_norm": 0.9774109882411877,
      "learning_rate": 7.767797306395988e-06,
      "loss": 0.791,
      "step": 1577
    },
    {
      "epoch": 0.5969923389766386,
      "grad_norm": 1.0358457305091455,
      "learning_rate": 7.76720510123142e-06,
      "loss": 0.7707,
      "step": 1578
    },
    {
      "epoch": 0.5973706611179419,
      "grad_norm": 1.0624591531096403,
      "learning_rate": 7.766612164486936e-06,
      "loss": 0.8472,
      "step": 1579
    },
    {
      "epoch": 0.5977489832592452,
      "grad_norm": 0.9928836589328845,
      "learning_rate": 7.766018496277682e-06,
      "loss": 0.7902,
      "step": 1580
    },
    {
      "epoch": 0.5981273054005486,
      "grad_norm": 1.0280490587815976,
      "learning_rate": 7.765424096718946e-06,
      "loss": 0.7841,
      "step": 1581
    },
    {
      "epoch": 0.5985056275418519,
      "grad_norm": 0.9873621543820231,
      "learning_rate": 7.76482896592616e-06,
      "loss": 0.8006,
      "step": 1582
    },
    {
      "epoch": 0.5988839496831552,
      "grad_norm": 1.0709729821860812,
      "learning_rate": 7.764233104014897e-06,
      "loss": 0.8682,
      "step": 1583
    },
    {
      "epoch": 0.5992622718244586,
      "grad_norm": 0.9867939695157474,
      "learning_rate": 7.76363651110087e-06,
      "loss": 0.7879,
      "step": 1584
    },
    {
      "epoch": 0.5996405939657619,
      "grad_norm": 1.0795152732921542,
      "learning_rate": 7.763039187299937e-06,
      "loss": 0.815,
      "step": 1585
    },
    {
      "epoch": 0.6000189161070651,
      "grad_norm": 0.9899000945502743,
      "learning_rate": 7.762441132728095e-06,
      "loss": 0.7855,
      "step": 1586
    },
    {
      "epoch": 0.6003972382483684,
      "grad_norm": 1.0252908086535142,
      "learning_rate": 7.761842347501485e-06,
      "loss": 0.8165,
      "step": 1587
    },
    {
      "epoch": 0.6007755603896718,
      "grad_norm": 1.0423466115896767,
      "learning_rate": 7.76124283173639e-06,
      "loss": 0.8567,
      "step": 1588
    },
    {
      "epoch": 0.6011538825309751,
      "grad_norm": 0.9948472361654808,
      "learning_rate": 7.760642585549233e-06,
      "loss": 0.7931,
      "step": 1589
    },
    {
      "epoch": 0.6015322046722784,
      "grad_norm": 0.9998595808495474,
      "learning_rate": 7.760041609056582e-06,
      "loss": 0.7922,
      "step": 1590
    },
    {
      "epoch": 0.6019105268135818,
      "grad_norm": 1.0113044627393564,
      "learning_rate": 7.759439902375141e-06,
      "loss": 0.7983,
      "step": 1591
    },
    {
      "epoch": 0.6022888489548851,
      "grad_norm": 1.052771258939431,
      "learning_rate": 7.758837465621764e-06,
      "loss": 0.8088,
      "step": 1592
    },
    {
      "epoch": 0.6026671710961884,
      "grad_norm": 1.0123858085251436,
      "learning_rate": 7.758234298913439e-06,
      "loss": 0.784,
      "step": 1593
    },
    {
      "epoch": 0.6030454932374917,
      "grad_norm": 1.0337794095975905,
      "learning_rate": 7.757630402367303e-06,
      "loss": 0.7997,
      "step": 1594
    },
    {
      "epoch": 0.6034238153787951,
      "grad_norm": 0.9846999031423823,
      "learning_rate": 7.757025776100625e-06,
      "loss": 0.7447,
      "step": 1595
    },
    {
      "epoch": 0.6038021375200984,
      "grad_norm": 1.0462409901802558,
      "learning_rate": 7.756420420230828e-06,
      "loss": 0.7686,
      "step": 1596
    },
    {
      "epoch": 0.6038021375200984,
      "eval_loss": 0.8007391691207886,
      "eval_runtime": 27.0514,
      "eval_samples_per_second": 32.715,
      "eval_steps_per_second": 1.035,
      "step": 1596
    },
    {
      "epoch": 0.6038021375200984,
      "eval_bench_accuracy_arc_challenge": 0.25,
      "eval_bench_accuracy_hellaswag": 0.21,
      "eval_bench_accuracy_mmlu": 0.25217391304347825,
      "eval_bench_average_accuracy": 0.23739130434782607,
      "eval_bench_loss": 6.375945509525767,
      "eval_bench_total_accuracy": 0.23296703296703297,
      "step": 1596
    },
    {
      "epoch": 0.6041804596614017,
      "grad_norm": 1.0790625835061922,
      "learning_rate": 7.755814334875466e-06,
      "loss": 0.8091,
      "step": 1597
    },
    {
      "epoch": 0.6045587818027051,
      "grad_norm": 0.9802043723000299,
      "learning_rate": 7.75520752015224e-06,
      "loss": 0.7256,
      "step": 1598
    },
    {
      "epoch": 0.6049371039440083,
      "grad_norm": 0.9923431981852016,
      "learning_rate": 7.754599976178994e-06,
      "loss": 0.8054,
      "step": 1599
    },
    {
      "epoch": 0.6053154260853116,
      "grad_norm": 1.0242822958979938,
      "learning_rate": 7.753991703073709e-06,
      "loss": 0.7947,
      "step": 1600
    },
    {
      "epoch": 0.6056937482266149,
      "grad_norm": 1.0693420250669043,
      "learning_rate": 7.75338270095451e-06,
      "loss": 0.7714,
      "step": 1601
    },
    {
      "epoch": 0.6060720703679183,
      "grad_norm": 1.0393417772805222,
      "learning_rate": 7.752772969939662e-06,
      "loss": 0.7984,
      "step": 1602
    },
    {
      "epoch": 0.6064503925092216,
      "grad_norm": 1.0193556335184584,
      "learning_rate": 7.752162510147576e-06,
      "loss": 0.7845,
      "step": 1603
    },
    {
      "epoch": 0.6068287146505249,
      "grad_norm": 1.0439223450090194,
      "learning_rate": 7.751551321696798e-06,
      "loss": 0.7902,
      "step": 1604
    },
    {
      "epoch": 0.6072070367918282,
      "grad_norm": 1.0458764132750307,
      "learning_rate": 7.75093940470602e-06,
      "loss": 0.8277,
      "step": 1605
    },
    {
      "epoch": 0.6075853589331316,
      "grad_norm": 1.0304823323522874,
      "learning_rate": 7.750326759294077e-06,
      "loss": 0.7936,
      "step": 1606
    },
    {
      "epoch": 0.6079636810744349,
      "grad_norm": 1.037572458907066,
      "learning_rate": 7.749713385579942e-06,
      "loss": 0.779,
      "step": 1607
    },
    {
      "epoch": 0.6083420032157382,
      "grad_norm": 1.0233220079303753,
      "learning_rate": 7.749099283682727e-06,
      "loss": 0.7924,
      "step": 1608
    },
    {
      "epoch": 0.6087203253570416,
      "grad_norm": 1.0490780083116327,
      "learning_rate": 7.748484453721694e-06,
      "loss": 0.8337,
      "step": 1609
    },
    {
      "epoch": 0.6090986474983449,
      "grad_norm": 1.0173257743419322,
      "learning_rate": 7.747868895816236e-06,
      "loss": 0.7673,
      "step": 1610
    },
    {
      "epoch": 0.6094769696396481,
      "grad_norm": 1.0573789547993953,
      "learning_rate": 7.747252610085895e-06,
      "loss": 0.8377,
      "step": 1611
    },
    {
      "epoch": 0.6098552917809514,
      "grad_norm": 1.0257255841383113,
      "learning_rate": 7.746635596650352e-06,
      "loss": 0.7728,
      "step": 1612
    },
    {
      "epoch": 0.6102336139222548,
      "grad_norm": 1.0160660389387,
      "learning_rate": 7.746017855629429e-06,
      "loss": 0.8025,
      "step": 1613
    },
    {
      "epoch": 0.6106119360635581,
      "grad_norm": 1.0602513504043805,
      "learning_rate": 7.74539938714309e-06,
      "loss": 0.7925,
      "step": 1614
    },
    {
      "epoch": 0.6109902582048614,
      "grad_norm": 1.0377020898351703,
      "learning_rate": 7.744780191311437e-06,
      "loss": 0.804,
      "step": 1615
    },
    {
      "epoch": 0.6113685803461648,
      "grad_norm": 0.9962327806446186,
      "learning_rate": 7.744160268254718e-06,
      "loss": 0.7463,
      "step": 1616
    },
    {
      "epoch": 0.6117469024874681,
      "grad_norm": 1.03576395621217,
      "learning_rate": 7.743539618093323e-06,
      "loss": 0.8125,
      "step": 1617
    },
    {
      "epoch": 0.6121252246287714,
      "grad_norm": 1.0791330433766595,
      "learning_rate": 7.742918240947774e-06,
      "loss": 0.7497,
      "step": 1618
    },
    {
      "epoch": 0.6125035467700747,
      "grad_norm": 1.0186732713870292,
      "learning_rate": 7.742296136938745e-06,
      "loss": 0.7715,
      "step": 1619
    },
    {
      "epoch": 0.6128818689113781,
      "grad_norm": 1.0549459798818361,
      "learning_rate": 7.741673306187047e-06,
      "loss": 0.7663,
      "step": 1620
    },
    {
      "epoch": 0.6132601910526814,
      "grad_norm": 0.9830530108058492,
      "learning_rate": 7.74104974881363e-06,
      "loss": 0.8146,
      "step": 1621
    },
    {
      "epoch": 0.6136385131939847,
      "grad_norm": 1.0384186325465743,
      "learning_rate": 7.74042546493959e-06,
      "loss": 0.7864,
      "step": 1622
    },
    {
      "epoch": 0.614016835335288,
      "grad_norm": 1.050915873907994,
      "learning_rate": 7.739800454686156e-06,
      "loss": 0.7966,
      "step": 1623
    },
    {
      "epoch": 0.6143951574765913,
      "grad_norm": 1.0241953725880033,
      "learning_rate": 7.739174718174705e-06,
      "loss": 0.7659,
      "step": 1624
    },
    {
      "epoch": 0.6147734796178946,
      "grad_norm": 1.0278047735993348,
      "learning_rate": 7.738548255526757e-06,
      "loss": 0.7753,
      "step": 1625
    },
    {
      "epoch": 0.6151518017591979,
      "grad_norm": 1.0028879958633992,
      "learning_rate": 7.737921066863963e-06,
      "loss": 0.798,
      "step": 1626
    },
    {
      "epoch": 0.6155301239005013,
      "grad_norm": 1.046709030024919,
      "learning_rate": 7.737293152308125e-06,
      "loss": 0.8318,
      "step": 1627
    },
    {
      "epoch": 0.6159084460418046,
      "grad_norm": 1.053664353449831,
      "learning_rate": 7.736664511981184e-06,
      "loss": 0.8518,
      "step": 1628
    },
    {
      "epoch": 0.6162867681831079,
      "grad_norm": 0.9978105688058767,
      "learning_rate": 7.736035146005216e-06,
      "loss": 0.7807,
      "step": 1629
    },
    {
      "epoch": 0.6166650903244112,
      "grad_norm": 1.0998599207938173,
      "learning_rate": 7.735405054502443e-06,
      "loss": 0.8517,
      "step": 1630
    },
    {
      "epoch": 0.6170434124657146,
      "grad_norm": 1.0347549984516864,
      "learning_rate": 7.734774237595227e-06,
      "loss": 0.7861,
      "step": 1631
    },
    {
      "epoch": 0.6174217346070179,
      "grad_norm": 1.0604030894353325,
      "learning_rate": 7.734142695406072e-06,
      "loss": 0.8444,
      "step": 1632
    },
    {
      "epoch": 0.6178000567483212,
      "grad_norm": 0.9995358654268639,
      "learning_rate": 7.73351042805762e-06,
      "loss": 0.7982,
      "step": 1633
    },
    {
      "epoch": 0.6181783788896246,
      "grad_norm": 1.012063791302332,
      "learning_rate": 7.732877435672656e-06,
      "loss": 0.7891,
      "step": 1634
    },
    {
      "epoch": 0.6185567010309279,
      "grad_norm": 1.062079535667684,
      "learning_rate": 7.732243718374105e-06,
      "loss": 0.7953,
      "step": 1635
    },
    {
      "epoch": 0.6189350231722311,
      "grad_norm": 1.0049506132948145,
      "learning_rate": 7.731609276285034e-06,
      "loss": 0.8185,
      "step": 1636
    },
    {
      "epoch": 0.6193133453135344,
      "grad_norm": 0.9787699976228371,
      "learning_rate": 7.730974109528651e-06,
      "loss": 0.8099,
      "step": 1637
    },
    {
      "epoch": 0.6196916674548378,
      "grad_norm": 0.9716390457115083,
      "learning_rate": 7.730338218228298e-06,
      "loss": 0.7695,
      "step": 1638
    },
    {
      "epoch": 0.6200699895961411,
      "grad_norm": 0.9806455110749785,
      "learning_rate": 7.729701602507469e-06,
      "loss": 0.7199,
      "step": 1639
    },
    {
      "epoch": 0.6204483117374444,
      "grad_norm": 1.0303904399928674,
      "learning_rate": 7.729064262489791e-06,
      "loss": 0.8018,
      "step": 1640
    },
    {
      "epoch": 0.6208266338787477,
      "grad_norm": 1.0184745198287024,
      "learning_rate": 7.72842619829903e-06,
      "loss": 0.8168,
      "step": 1641
    },
    {
      "epoch": 0.6212049560200511,
      "grad_norm": 1.0350761019221557,
      "learning_rate": 7.727787410059102e-06,
      "loss": 0.8063,
      "step": 1642
    },
    {
      "epoch": 0.6215832781613544,
      "grad_norm": 0.9997598615132083,
      "learning_rate": 7.727147897894055e-06,
      "loss": 0.7692,
      "step": 1643
    },
    {
      "epoch": 0.6219616003026577,
      "grad_norm": 1.0317018080080016,
      "learning_rate": 7.72650766192808e-06,
      "loss": 0.7963,
      "step": 1644
    },
    {
      "epoch": 0.6223399224439611,
      "grad_norm": 1.058330305743686,
      "learning_rate": 7.725866702285508e-06,
      "loss": 0.7778,
      "step": 1645
    },
    {
      "epoch": 0.6227182445852644,
      "grad_norm": 1.050475543436919,
      "learning_rate": 7.725225019090813e-06,
      "loss": 0.8052,
      "step": 1646
    },
    {
      "epoch": 0.6230965667265677,
      "grad_norm": 1.0381951307937078,
      "learning_rate": 7.724582612468609e-06,
      "loss": 0.7643,
      "step": 1647
    },
    {
      "epoch": 0.623474888867871,
      "grad_norm": 0.9960696467209328,
      "learning_rate": 7.723939482543647e-06,
      "loss": 0.781,
      "step": 1648
    },
    {
      "epoch": 0.6238532110091743,
      "grad_norm": 1.0235710160288658,
      "learning_rate": 7.723295629440823e-06,
      "loss": 0.7818,
      "step": 1649
    },
    {
      "epoch": 0.6242315331504776,
      "grad_norm": 0.9987662526618373,
      "learning_rate": 7.722651053285168e-06,
      "loss": 0.7532,
      "step": 1650
    },
    {
      "epoch": 0.6246098552917809,
      "grad_norm": 1.038603322649077,
      "learning_rate": 7.722005754201863e-06,
      "loss": 0.7995,
      "step": 1651
    },
    {
      "epoch": 0.6249881774330843,
      "grad_norm": 1.0372844825153233,
      "learning_rate": 7.721359732316216e-06,
      "loss": 0.7982,
      "step": 1652
    },
    {
      "epoch": 0.6253664995743876,
      "grad_norm": 1.0075983510701718,
      "learning_rate": 7.720712987753687e-06,
      "loss": 0.771,
      "step": 1653
    },
    {
      "epoch": 0.6257448217156909,
      "grad_norm": 1.060885095951037,
      "learning_rate": 7.72006552063987e-06,
      "loss": 0.8095,
      "step": 1654
    },
    {
      "epoch": 0.6261231438569942,
      "grad_norm": 1.024942261074342,
      "learning_rate": 7.719417331100501e-06,
      "loss": 0.8175,
      "step": 1655
    },
    {
      "epoch": 0.6265014659982976,
      "grad_norm": 1.0259969128854978,
      "learning_rate": 7.718768419261458e-06,
      "loss": 0.7614,
      "step": 1656
    },
    {
      "epoch": 0.6268797881396009,
      "grad_norm": 1.0032297451874017,
      "learning_rate": 7.718118785248759e-06,
      "loss": 0.7612,
      "step": 1657
    },
    {
      "epoch": 0.6272581102809042,
      "grad_norm": 1.0210932763381098,
      "learning_rate": 7.717468429188556e-06,
      "loss": 0.7755,
      "step": 1658
    },
    {
      "epoch": 0.6276364324222075,
      "grad_norm": 1.046603168853803,
      "learning_rate": 7.71681735120715e-06,
      "loss": 0.7888,
      "step": 1659
    },
    {
      "epoch": 0.6280147545635109,
      "grad_norm": 1.0302944601931032,
      "learning_rate": 7.716165551430978e-06,
      "loss": 0.8215,
      "step": 1660
    },
    {
      "epoch": 0.6283930767048141,
      "grad_norm": 1.0538426037667707,
      "learning_rate": 7.715513029986616e-06,
      "loss": 0.8277,
      "step": 1661
    },
    {
      "epoch": 0.6287713988461174,
      "grad_norm": 1.0079131456868133,
      "learning_rate": 7.714859787000784e-06,
      "loss": 0.7898,
      "step": 1662
    },
    {
      "epoch": 0.6291497209874208,
      "grad_norm": 1.0091132558305784,
      "learning_rate": 7.714205822600338e-06,
      "loss": 0.7628,
      "step": 1663
    },
    {
      "epoch": 0.6295280431287241,
      "grad_norm": 1.0370707510362853,
      "learning_rate": 7.713551136912277e-06,
      "loss": 0.7847,
      "step": 1664
    },
    {
      "epoch": 0.6299063652700274,
      "grad_norm": 1.0254976981220805,
      "learning_rate": 7.712895730063737e-06,
      "loss": 0.8251,
      "step": 1665
    },
    {
      "epoch": 0.6302846874113307,
      "grad_norm": 1.0129086665617333,
      "learning_rate": 7.712239602181998e-06,
      "loss": 0.813,
      "step": 1666
    },
    {
      "epoch": 0.6306630095526341,
      "grad_norm": 1.0211770501504658,
      "learning_rate": 7.711582753394478e-06,
      "loss": 0.7909,
      "step": 1667
    },
    {
      "epoch": 0.6310413316939374,
      "grad_norm": 1.2302756712980163,
      "learning_rate": 7.710925183828736e-06,
      "loss": 0.782,
      "step": 1668
    },
    {
      "epoch": 0.6314196538352407,
      "grad_norm": 1.0606820966683679,
      "learning_rate": 7.710266893612468e-06,
      "loss": 0.8001,
      "step": 1669
    },
    {
      "epoch": 0.6317979759765441,
      "grad_norm": 1.0257958327969605,
      "learning_rate": 7.70960788287351e-06,
      "loss": 0.7715,
      "step": 1670
    },
    {
      "epoch": 0.6321762981178474,
      "grad_norm": 1.033181617178253,
      "learning_rate": 7.708948151739847e-06,
      "loss": 0.7884,
      "step": 1671
    },
    {
      "epoch": 0.6325546202591507,
      "grad_norm": 1.0142271201151716,
      "learning_rate": 7.708287700339588e-06,
      "loss": 0.7846,
      "step": 1672
    },
    {
      "epoch": 0.632932942400454,
      "grad_norm": 1.0581952369577206,
      "learning_rate": 7.707626528800999e-06,
      "loss": 0.835,
      "step": 1673
    },
    {
      "epoch": 0.6333112645417573,
      "grad_norm": 1.031831226064096,
      "learning_rate": 7.706964637252472e-06,
      "loss": 0.7808,
      "step": 1674
    },
    {
      "epoch": 0.6336895866830606,
      "grad_norm": 1.034926042820135,
      "learning_rate": 7.706302025822546e-06,
      "loss": 0.8133,
      "step": 1675
    },
    {
      "epoch": 0.6340679088243639,
      "grad_norm": 0.9974796232689039,
      "learning_rate": 7.705638694639897e-06,
      "loss": 0.8022,
      "step": 1676
    },
    {
      "epoch": 0.6344462309656672,
      "grad_norm": 0.9991746871631939,
      "learning_rate": 7.704974643833345e-06,
      "loss": 0.7768,
      "step": 1677
    },
    {
      "epoch": 0.6348245531069706,
      "grad_norm": 1.0647934668234986,
      "learning_rate": 7.704309873531842e-06,
      "loss": 0.7784,
      "step": 1678
    },
    {
      "epoch": 0.6352028752482739,
      "grad_norm": 1.0706641503151557,
      "learning_rate": 7.70364438386449e-06,
      "loss": 0.7549,
      "step": 1679
    },
    {
      "epoch": 0.6355811973895772,
      "grad_norm": 1.5575289700539314,
      "learning_rate": 7.70297817496052e-06,
      "loss": 0.7869,
      "step": 1680
    },
    {
      "epoch": 0.6359595195308806,
      "grad_norm": 1.0441884975223152,
      "learning_rate": 7.702311246949312e-06,
      "loss": 0.8212,
      "step": 1681
    },
    {
      "epoch": 0.6363378416721839,
      "grad_norm": 1.0184875000693254,
      "learning_rate": 7.701643599960377e-06,
      "loss": 0.7783,
      "step": 1682
    },
    {
      "epoch": 0.6367161638134872,
      "grad_norm": 1.056484375092538,
      "learning_rate": 7.700975234123374e-06,
      "loss": 0.7997,
      "step": 1683
    },
    {
      "epoch": 0.6370944859547905,
      "grad_norm": 1.0158431220473627,
      "learning_rate": 7.700306149568096e-06,
      "loss": 0.7887,
      "step": 1684
    },
    {
      "epoch": 0.6374728080960939,
      "grad_norm": 1.005886147632736,
      "learning_rate": 7.699636346424476e-06,
      "loss": 0.8146,
      "step": 1685
    },
    {
      "epoch": 0.6378511302373971,
      "grad_norm": 0.9516674282028371,
      "learning_rate": 7.698965824822591e-06,
      "loss": 0.7617,
      "step": 1686
    },
    {
      "epoch": 0.6382294523787004,
      "grad_norm": 1.0354398239486777,
      "learning_rate": 7.698294584892653e-06,
      "loss": 0.7698,
      "step": 1687
    },
    {
      "epoch": 0.6386077745200038,
      "grad_norm": 1.0412153778199809,
      "learning_rate": 7.69762262676501e-06,
      "loss": 0.7741,
      "step": 1688
    },
    {
      "epoch": 0.6389860966613071,
      "grad_norm": 1.0038063833719368,
      "learning_rate": 7.696949950570162e-06,
      "loss": 0.7726,
      "step": 1689
    },
    {
      "epoch": 0.6393644188026104,
      "grad_norm": 1.0041297661402129,
      "learning_rate": 7.696276556438736e-06,
      "loss": 0.8076,
      "step": 1690
    },
    {
      "epoch": 0.6397427409439137,
      "grad_norm": 1.052469874333398,
      "learning_rate": 7.695602444501503e-06,
      "loss": 0.7906,
      "step": 1691
    },
    {
      "epoch": 0.6401210630852171,
      "grad_norm": 0.9490194460452617,
      "learning_rate": 7.694927614889376e-06,
      "loss": 0.7188,
      "step": 1692
    },
    {
      "epoch": 0.6404993852265204,
      "grad_norm": 0.974323163548883,
      "learning_rate": 7.694252067733404e-06,
      "loss": 0.753,
      "step": 1693
    },
    {
      "epoch": 0.6408777073678237,
      "grad_norm": 1.0319007840691403,
      "learning_rate": 7.693575803164774e-06,
      "loss": 0.7962,
      "step": 1694
    },
    {
      "epoch": 0.641256029509127,
      "grad_norm": 1.0299952133041577,
      "learning_rate": 7.692898821314816e-06,
      "loss": 0.7723,
      "step": 1695
    },
    {
      "epoch": 0.6416343516504304,
      "grad_norm": 1.0632785008902024,
      "learning_rate": 7.692221122315e-06,
      "loss": 0.7536,
      "step": 1696
    },
    {
      "epoch": 0.6420126737917337,
      "grad_norm": 1.0478356927175443,
      "learning_rate": 7.69154270629693e-06,
      "loss": 0.7759,
      "step": 1697
    },
    {
      "epoch": 0.642390995933037,
      "grad_norm": 1.0207221782050084,
      "learning_rate": 7.690863573392355e-06,
      "loss": 0.8025,
      "step": 1698
    },
    {
      "epoch": 0.6427693180743403,
      "grad_norm": 1.0307450911725362,
      "learning_rate": 7.690183723733158e-06,
      "loss": 0.8126,
      "step": 1699
    },
    {
      "epoch": 0.6431476402156436,
      "grad_norm": 0.9558201805744811,
      "learning_rate": 7.689503157451366e-06,
      "loss": 0.7926,
      "step": 1700
    },
    {
      "epoch": 0.6435259623569469,
      "grad_norm": 0.9839314509833194,
      "learning_rate": 7.68882187467914e-06,
      "loss": 0.7982,
      "step": 1701
    },
    {
      "epoch": 0.6439042844982502,
      "grad_norm": 1.0446036605229558,
      "learning_rate": 7.688139875548786e-06,
      "loss": 0.7424,
      "step": 1702
    },
    {
      "epoch": 0.6442826066395536,
      "grad_norm": 0.9747599328413645,
      "learning_rate": 7.687457160192746e-06,
      "loss": 0.7769,
      "step": 1703
    },
    {
      "epoch": 0.6446609287808569,
      "grad_norm": 1.0017104708165576,
      "learning_rate": 7.6867737287436e-06,
      "loss": 0.7779,
      "step": 1704
    },
    {
      "epoch": 0.6450392509221602,
      "grad_norm": 1.0396981093860427,
      "learning_rate": 7.686089581334069e-06,
      "loss": 0.7966,
      "step": 1705
    },
    {
      "epoch": 0.6454175730634636,
      "grad_norm": 1.0077578946931687,
      "learning_rate": 7.685404718097011e-06,
      "loss": 0.7658,
      "step": 1706
    },
    {
      "epoch": 0.6457958952047669,
      "grad_norm": 1.0045936301109948,
      "learning_rate": 7.684719139165426e-06,
      "loss": 0.8215,
      "step": 1707
    },
    {
      "epoch": 0.6461742173460702,
      "grad_norm": 1.0059220607870412,
      "learning_rate": 7.684032844672452e-06,
      "loss": 0.784,
      "step": 1708
    },
    {
      "epoch": 0.6465525394873735,
      "grad_norm": 1.002030780249217,
      "learning_rate": 7.683345834751362e-06,
      "loss": 0.754,
      "step": 1709
    },
    {
      "epoch": 0.6469308616286769,
      "grad_norm": 1.0524082695853973,
      "learning_rate": 7.682658109535575e-06,
      "loss": 0.8141,
      "step": 1710
    },
    {
      "epoch": 0.6473091837699801,
      "grad_norm": 1.023391717099541,
      "learning_rate": 7.681969669158643e-06,
      "loss": 0.8029,
      "step": 1711
    },
    {
      "epoch": 0.6476875059112834,
      "grad_norm": 1.0537878870256816,
      "learning_rate": 7.68128051375426e-06,
      "loss": 0.8026,
      "step": 1712
    },
    {
      "epoch": 0.6480658280525867,
      "grad_norm": 0.9946301646936768,
      "learning_rate": 7.680590643456258e-06,
      "loss": 0.8154,
      "step": 1713
    },
    {
      "epoch": 0.6484441501938901,
      "grad_norm": 1.0129808485922718,
      "learning_rate": 7.679900058398606e-06,
      "loss": 0.7482,
      "step": 1714
    },
    {
      "epoch": 0.6488224723351934,
      "grad_norm": 1.1366026781982712,
      "learning_rate": 7.679208758715417e-06,
      "loss": 0.7844,
      "step": 1715
    },
    {
      "epoch": 0.6492007944764967,
      "grad_norm": 1.0252138838659255,
      "learning_rate": 7.678516744540936e-06,
      "loss": 0.7827,
      "step": 1716
    },
    {
      "epoch": 0.6495791166178001,
      "grad_norm": 1.0483329033578623,
      "learning_rate": 7.67782401600955e-06,
      "loss": 0.7995,
      "step": 1717
    },
    {
      "epoch": 0.6499574387591034,
      "grad_norm": 0.9954302178962173,
      "learning_rate": 7.677130573255787e-06,
      "loss": 0.7528,
      "step": 1718
    },
    {
      "epoch": 0.6503357609004067,
      "grad_norm": 1.0342284002896778,
      "learning_rate": 7.67643641641431e-06,
      "loss": 0.7967,
      "step": 1719
    },
    {
      "epoch": 0.65071408304171,
      "grad_norm": 1.0744541931554912,
      "learning_rate": 7.675741545619926e-06,
      "loss": 0.7959,
      "step": 1720
    },
    {
      "epoch": 0.6510924051830134,
      "grad_norm": 0.9960576642926111,
      "learning_rate": 7.675045961007571e-06,
      "loss": 0.7644,
      "step": 1721
    },
    {
      "epoch": 0.6514707273243167,
      "grad_norm": 1.0388432797415568,
      "learning_rate": 7.674349662712328e-06,
      "loss": 0.8452,
      "step": 1722
    },
    {
      "epoch": 0.65184904946562,
      "grad_norm": 1.0809172859395315,
      "learning_rate": 7.673652650869415e-06,
      "loss": 0.8068,
      "step": 1723
    },
    {
      "epoch": 0.6522273716069233,
      "grad_norm": 1.0066539502318497,
      "learning_rate": 7.672954925614193e-06,
      "loss": 0.7709,
      "step": 1724
    },
    {
      "epoch": 0.6526056937482266,
      "grad_norm": 1.0418268199259764,
      "learning_rate": 7.672256487082155e-06,
      "loss": 0.7932,
      "step": 1725
    },
    {
      "epoch": 0.6529840158895299,
      "grad_norm": 1.0245053090908052,
      "learning_rate": 7.671557335408935e-06,
      "loss": 0.798,
      "step": 1726
    },
    {
      "epoch": 0.6533623380308332,
      "grad_norm": 1.0356795152001224,
      "learning_rate": 7.670857470730309e-06,
      "loss": 0.7573,
      "step": 1727
    },
    {
      "epoch": 0.6537406601721366,
      "grad_norm": 1.0311220411463944,
      "learning_rate": 7.670156893182188e-06,
      "loss": 0.8159,
      "step": 1728
    },
    {
      "epoch": 0.6541189823134399,
      "grad_norm": 0.9968740214468425,
      "learning_rate": 7.66945560290062e-06,
      "loss": 0.8174,
      "step": 1729
    },
    {
      "epoch": 0.6541189823134399,
      "eval_loss": 0.7927515506744385,
      "eval_runtime": 26.7774,
      "eval_samples_per_second": 33.05,
      "eval_steps_per_second": 1.046,
      "step": 1729
    },
    {
      "epoch": 0.6541189823134399,
      "eval_bench_accuracy_arc_challenge": 0.0,
      "eval_bench_accuracy_hellaswag": 0.21,
      "eval_bench_accuracy_mmlu": 0.23478260869565218,
      "eval_bench_average_accuracy": 0.1482608695652174,
      "eval_bench_loss": 7.814903928522478,
      "eval_bench_total_accuracy": 0.15164835164835164,
      "step": 1729
    },
    {
      "epoch": 0.6544973044547432,
      "grad_norm": 1.0536869570872927,
      "learning_rate": 7.668753600021795e-06,
      "loss": 0.7894,
      "step": 1730
    },
    {
      "epoch": 0.6548756265960465,
      "grad_norm": 1.0802849973303468,
      "learning_rate": 7.66805088468204e-06,
      "loss": 0.8128,
      "step": 1731
    },
    {
      "epoch": 0.6552539487373499,
      "grad_norm": 1.0195535501035122,
      "learning_rate": 7.66734745701782e-06,
      "loss": 0.7698,
      "step": 1732
    },
    {
      "epoch": 0.6556322708786532,
      "grad_norm": 0.9866819845303567,
      "learning_rate": 7.666643317165737e-06,
      "loss": 0.7632,
      "step": 1733
    },
    {
      "epoch": 0.6560105930199565,
      "grad_norm": 1.0362620307566515,
      "learning_rate": 7.665938465262536e-06,
      "loss": 0.8242,
      "step": 1734
    },
    {
      "epoch": 0.6563889151612599,
      "grad_norm": 1.005122320879091,
      "learning_rate": 7.665232901445093e-06,
      "loss": 0.8128,
      "step": 1735
    },
    {
      "epoch": 0.6567672373025631,
      "grad_norm": 0.9968147052835493,
      "learning_rate": 7.66452662585043e-06,
      "loss": 0.7765,
      "step": 1736
    },
    {
      "epoch": 0.6571455594438664,
      "grad_norm": 1.0160098359583503,
      "learning_rate": 7.663819638615705e-06,
      "loss": 0.769,
      "step": 1737
    },
    {
      "epoch": 0.6575238815851697,
      "grad_norm": 0.9957799905329473,
      "learning_rate": 7.663111939878207e-06,
      "loss": 0.75,
      "step": 1738
    },
    {
      "epoch": 0.6579022037264731,
      "grad_norm": 0.9817964252654222,
      "learning_rate": 7.662403529775372e-06,
      "loss": 0.7814,
      "step": 1739
    },
    {
      "epoch": 0.6582805258677764,
      "grad_norm": 0.9928916742992132,
      "learning_rate": 7.661694408444773e-06,
      "loss": 0.7904,
      "step": 1740
    },
    {
      "epoch": 0.6586588480090797,
      "grad_norm": 1.0410892155118083,
      "learning_rate": 7.660984576024117e-06,
      "loss": 0.8191,
      "step": 1741
    },
    {
      "epoch": 0.6590371701503831,
      "grad_norm": 1.0021028586166405,
      "learning_rate": 7.660274032651249e-06,
      "loss": 0.7712,
      "step": 1742
    },
    {
      "epoch": 0.6594154922916864,
      "grad_norm": 0.9990600675172764,
      "learning_rate": 7.65956277846416e-06,
      "loss": 0.7857,
      "step": 1743
    },
    {
      "epoch": 0.6597938144329897,
      "grad_norm": 1.0992751750590166,
      "learning_rate": 7.658850813600969e-06,
      "loss": 0.7878,
      "step": 1744
    },
    {
      "epoch": 0.660172136574293,
      "grad_norm": 1.0189976892843522,
      "learning_rate": 7.65813813819994e-06,
      "loss": 0.77,
      "step": 1745
    },
    {
      "epoch": 0.6605504587155964,
      "grad_norm": 1.0468429508760897,
      "learning_rate": 7.657424752399471e-06,
      "loss": 0.7768,
      "step": 1746
    },
    {
      "epoch": 0.6609287808568997,
      "grad_norm": 1.0374665153019,
      "learning_rate": 7.6567106563381e-06,
      "loss": 0.8103,
      "step": 1747
    },
    {
      "epoch": 0.661307102998203,
      "grad_norm": 1.0713460469365848,
      "learning_rate": 7.655995850154501e-06,
      "loss": 0.7646,
      "step": 1748
    },
    {
      "epoch": 0.6616854251395063,
      "grad_norm": 1.048711304359486,
      "learning_rate": 7.655280333987491e-06,
      "loss": 0.7852,
      "step": 1749
    },
    {
      "epoch": 0.6620637472808096,
      "grad_norm": 1.0319143016049546,
      "learning_rate": 7.654564107976017e-06,
      "loss": 0.7979,
      "step": 1750
    },
    {
      "epoch": 0.6624420694221129,
      "grad_norm": 1.0575930996275595,
      "learning_rate": 7.653847172259169e-06,
      "loss": 0.7768,
      "step": 1751
    },
    {
      "epoch": 0.6628203915634162,
      "grad_norm": 0.9638702778680636,
      "learning_rate": 7.653129526976173e-06,
      "loss": 0.7979,
      "step": 1752
    },
    {
      "epoch": 0.6631987137047196,
      "grad_norm": 0.9690337454201767,
      "learning_rate": 7.652411172266398e-06,
      "loss": 0.7894,
      "step": 1753
    },
    {
      "epoch": 0.6635770358460229,
      "grad_norm": 1.0072303768845905,
      "learning_rate": 7.65169210826934e-06,
      "loss": 0.7302,
      "step": 1754
    },
    {
      "epoch": 0.6639553579873262,
      "grad_norm": 1.0168462219112109,
      "learning_rate": 7.650972335124644e-06,
      "loss": 0.7918,
      "step": 1755
    },
    {
      "epoch": 0.6643336801286295,
      "grad_norm": 0.9845272479814176,
      "learning_rate": 7.650251852972084e-06,
      "loss": 0.7798,
      "step": 1756
    },
    {
      "epoch": 0.6647120022699329,
      "grad_norm": 1.0559359255774574,
      "learning_rate": 7.649530661951578e-06,
      "loss": 0.7835,
      "step": 1757
    },
    {
      "epoch": 0.6650903244112362,
      "grad_norm": 1.0127474528668845,
      "learning_rate": 7.64880876220318e-06,
      "loss": 0.7566,
      "step": 1758
    },
    {
      "epoch": 0.6654686465525395,
      "grad_norm": 1.067173774382862,
      "learning_rate": 7.648086153867078e-06,
      "loss": 0.7738,
      "step": 1759
    },
    {
      "epoch": 0.6658469686938429,
      "grad_norm": 1.0262747793123224,
      "learning_rate": 7.6473628370836e-06,
      "loss": 0.7833,
      "step": 1760
    },
    {
      "epoch": 0.6662252908351461,
      "grad_norm": 1.0515582564211456,
      "learning_rate": 7.646638811993216e-06,
      "loss": 0.7538,
      "step": 1761
    },
    {
      "epoch": 0.6666036129764494,
      "grad_norm": 1.0329994771612065,
      "learning_rate": 7.645914078736526e-06,
      "loss": 0.8164,
      "step": 1762
    },
    {
      "epoch": 0.6669819351177527,
      "grad_norm": 1.0311907540077614,
      "learning_rate": 7.645188637454272e-06,
      "loss": 0.7706,
      "step": 1763
    },
    {
      "epoch": 0.6673602572590561,
      "grad_norm": 1.0409947640223565,
      "learning_rate": 7.644462488287334e-06,
      "loss": 0.7885,
      "step": 1764
    },
    {
      "epoch": 0.6677385794003594,
      "grad_norm": 0.988219756000234,
      "learning_rate": 7.643735631376724e-06,
      "loss": 0.7408,
      "step": 1765
    },
    {
      "epoch": 0.6681169015416627,
      "grad_norm": 1.027004288225805,
      "learning_rate": 7.643008066863598e-06,
      "loss": 0.8121,
      "step": 1766
    },
    {
      "epoch": 0.6684952236829661,
      "grad_norm": 1.0184065601333092,
      "learning_rate": 7.642279794889249e-06,
      "loss": 0.7576,
      "step": 1767
    },
    {
      "epoch": 0.6688735458242694,
      "grad_norm": 1.043603934502605,
      "learning_rate": 7.641550815595102e-06,
      "loss": 0.771,
      "step": 1768
    },
    {
      "epoch": 0.6692518679655727,
      "grad_norm": 1.060392114018632,
      "learning_rate": 7.640821129122723e-06,
      "loss": 0.8247,
      "step": 1769
    },
    {
      "epoch": 0.669630190106876,
      "grad_norm": 1.0126323816870029,
      "learning_rate": 7.640090735613818e-06,
      "loss": 0.8022,
      "step": 1770
    },
    {
      "epoch": 0.6700085122481794,
      "grad_norm": 1.1648366101787067,
      "learning_rate": 7.639359635210222e-06,
      "loss": 0.7826,
      "step": 1771
    },
    {
      "epoch": 0.6703868343894827,
      "grad_norm": 1.0724674686904885,
      "learning_rate": 7.638627828053918e-06,
      "loss": 0.7897,
      "step": 1772
    },
    {
      "epoch": 0.6707651565307859,
      "grad_norm": 1.0540972019117152,
      "learning_rate": 7.637895314287016e-06,
      "loss": 0.7645,
      "step": 1773
    },
    {
      "epoch": 0.6711434786720892,
      "grad_norm": 1.0057331810331451,
      "learning_rate": 7.63716209405177e-06,
      "loss": 0.816,
      "step": 1774
    },
    {
      "epoch": 0.6715218008133926,
      "grad_norm": 0.9970921236923102,
      "learning_rate": 7.63642816749057e-06,
      "loss": 0.7671,
      "step": 1775
    },
    {
      "epoch": 0.6719001229546959,
      "grad_norm": 1.002453880727358,
      "learning_rate": 7.635693534745941e-06,
      "loss": 0.7885,
      "step": 1776
    },
    {
      "epoch": 0.6722784450959992,
      "grad_norm": 1.0312771975163908,
      "learning_rate": 7.634958195960548e-06,
      "loss": 0.7951,
      "step": 1777
    },
    {
      "epoch": 0.6726567672373026,
      "grad_norm": 1.0177245342291783,
      "learning_rate": 7.634222151277188e-06,
      "loss": 0.773,
      "step": 1778
    },
    {
      "epoch": 0.6730350893786059,
      "grad_norm": 1.060998481737934,
      "learning_rate": 7.633485400838804e-06,
      "loss": 0.7924,
      "step": 1779
    },
    {
      "epoch": 0.6734134115199092,
      "grad_norm": 1.0340561242421995,
      "learning_rate": 7.632747944788468e-06,
      "loss": 0.8451,
      "step": 1780
    },
    {
      "epoch": 0.6737917336612125,
      "grad_norm": 1.0461873170538059,
      "learning_rate": 7.63200978326939e-06,
      "loss": 0.7896,
      "step": 1781
    },
    {
      "epoch": 0.6741700558025159,
      "grad_norm": 1.0320131696114871,
      "learning_rate": 7.631270916424923e-06,
      "loss": 0.7914,
      "step": 1782
    },
    {
      "epoch": 0.6745483779438192,
      "grad_norm": 1.0291951526102714,
      "learning_rate": 7.630531344398549e-06,
      "loss": 0.7273,
      "step": 1783
    },
    {
      "epoch": 0.6749267000851225,
      "grad_norm": 1.0352838518441736,
      "learning_rate": 7.62979106733389e-06,
      "loss": 0.8042,
      "step": 1784
    },
    {
      "epoch": 0.6753050222264259,
      "grad_norm": 0.999179215624018,
      "learning_rate": 7.629050085374709e-06,
      "loss": 0.8106,
      "step": 1785
    },
    {
      "epoch": 0.6756833443677291,
      "grad_norm": 1.002781374078623,
      "learning_rate": 7.6283083986649e-06,
      "loss": 0.7478,
      "step": 1786
    },
    {
      "epoch": 0.6760616665090324,
      "grad_norm": 1.0578987973117508,
      "learning_rate": 7.627566007348498e-06,
      "loss": 0.767,
      "step": 1787
    },
    {
      "epoch": 0.6764399886503357,
      "grad_norm": 1.018623825083434,
      "learning_rate": 7.626822911569673e-06,
      "loss": 0.7603,
      "step": 1788
    },
    {
      "epoch": 0.6768183107916391,
      "grad_norm": 1.0691359310227244,
      "learning_rate": 7.62607911147273e-06,
      "loss": 0.8033,
      "step": 1789
    },
    {
      "epoch": 0.6771966329329424,
      "grad_norm": 1.0473330500599638,
      "learning_rate": 7.625334607202115e-06,
      "loss": 0.799,
      "step": 1790
    },
    {
      "epoch": 0.6775749550742457,
      "grad_norm": 1.0276960283606948,
      "learning_rate": 7.624589398902408e-06,
      "loss": 0.7882,
      "step": 1791
    },
    {
      "epoch": 0.677953277215549,
      "grad_norm": 1.0216841452284737,
      "learning_rate": 7.623843486718325e-06,
      "loss": 0.7753,
      "step": 1792
    },
    {
      "epoch": 0.6783315993568524,
      "grad_norm": 1.017840190852707,
      "learning_rate": 7.623096870794722e-06,
      "loss": 0.7944,
      "step": 1793
    },
    {
      "epoch": 0.6787099214981557,
      "grad_norm": 1.0234534365543315,
      "learning_rate": 7.6223495512765865e-06,
      "loss": 0.7607,
      "step": 1794
    },
    {
      "epoch": 0.679088243639459,
      "grad_norm": 1.0142595858519063,
      "learning_rate": 7.621601528309049e-06,
      "loss": 0.7665,
      "step": 1795
    },
    {
      "epoch": 0.6794665657807624,
      "grad_norm": 1.0071219703193526,
      "learning_rate": 7.620852802037371e-06,
      "loss": 0.791,
      "step": 1796
    },
    {
      "epoch": 0.6798448879220657,
      "grad_norm": 1.0031377757032336,
      "learning_rate": 7.620103372606954e-06,
      "loss": 0.7502,
      "step": 1797
    },
    {
      "epoch": 0.6802232100633689,
      "grad_norm": 1.014284865797237,
      "learning_rate": 7.619353240163334e-06,
      "loss": 0.8012,
      "step": 1798
    },
    {
      "epoch": 0.6806015322046722,
      "grad_norm": 1.0281456730858456,
      "learning_rate": 7.618602404852186e-06,
      "loss": 0.8308,
      "step": 1799
    },
    {
      "epoch": 0.6809798543459756,
      "grad_norm": 1.0358974761664392,
      "learning_rate": 7.617850866819319e-06,
      "loss": 0.8116,
      "step": 1800
    },
    {
      "epoch": 0.6813581764872789,
      "grad_norm": 1.0233639481564207,
      "learning_rate": 7.61709862621068e-06,
      "loss": 0.8062,
      "step": 1801
    },
    {
      "epoch": 0.6817364986285822,
      "grad_norm": 0.9776086740367372,
      "learning_rate": 7.61634568317235e-06,
      "loss": 0.7926,
      "step": 1802
    }
  ],
  "logging_steps": 1,
  "max_steps": 7929,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 53,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.967375682524414e+19,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}