Upload folder using huggingface_hub

Browse files

Files changed (8) hide show

config.json +41 -0
generation_config.json +6 -0
optimizer.pt +3 -0
pytorch_model.bin +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +3573 -0
training_args.bin +3 -0

config.json ADDED Viewed

	@@ -0,0 +1,41 @@

+{
+  "_name_or_path": "gpt2-medium",
+  "activation_function": "gelu_new",
+  "architectures": [
+    "GPT2LMHeadModel"
+  ],
+  "attn_pdrop": 0.1,
+  "bos_token_id": 50256,
+  "embd_pdrop": 0.1,
+  "eos_token_id": 50256,
+  "initializer_range": 0.02,
+  "layer_norm_epsilon": 1e-05,
+  "model_type": "gpt2",
+  "n_ctx": 1024,
+  "n_embd": 1024,
+  "n_head": 16,
+  "n_inner": null,
+  "n_layer": 24,
+  "n_positions": 1024,
+  "n_special": 0,
+  "predict_special_tokens": true,
+  "reorder_and_upcast_attn": false,
+  "resid_pdrop": 0.1,
+  "scale_attn_by_inverse_layer_idx": false,
+  "scale_attn_weights": true,
+  "summary_activation": null,
+  "summary_first_dropout": 0.1,
+  "summary_proj_to_labels": true,
+  "summary_type": "cls_index",
+  "summary_use_proj": true,
+  "task_specific_params": {
+    "text-generation": {
+      "do_sample": true,
+      "max_length": 50
+    }
+  },
+  "torch_dtype": "float32",
+  "transformers_version": "4.48.1",
+  "use_cache": true,
+  "vocab_size": 50257
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "_from_model_config": true,
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.48.1"
+}

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:db7739905c147b17d274771a25e55851f13f02174b812014d68a1a9cd62907e6
+size 2838829242

pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:096fd587e795a89eeb51c8a0a424af48e0a357ece497f4554e415b0cdfd8ae59
+size 1419388314

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:059726101f5ec710d50a3b485f6858b22467df59fa4ff2ef558ac12ca72bea00
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ac9d3e73e0c21c379b4e6670dc27c8dfe37b6b09e4c0969cc088d2ffcae7c045
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3573 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.02869777005849354,
+  "eval_steps": 100,
+  "global_step": 500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 5.739554011698708e-05,
+      "grad_norm": 1.8802112340927124,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 2.9438,
+      "step": 1
+    },
+    {
+      "epoch": 0.00011479108023397416,
+      "grad_norm": 1.9408955574035645,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 2.9429,
+      "step": 2
+    },
+    {
+      "epoch": 0.00017218662035096125,
+      "grad_norm": 2.9192652702331543,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 2.952,
+      "step": 3
+    },
+    {
+      "epoch": 0.00022958216046794832,
+      "grad_norm": 2.3403642177581787,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 2.9307,
+      "step": 4
+    },
+    {
+      "epoch": 0.00028697770058493544,
+      "grad_norm": 2.134683847427368,
+      "learning_rate": 0.0003,
+      "loss": 2.8917,
+      "step": 5
+    },
+    {
+      "epoch": 0.0003443732407019225,
+      "grad_norm": 1.5358260869979858,
+      "learning_rate": 0.00035999999999999997,
+      "loss": 2.9205,
+      "step": 6
+    },
+    {
+      "epoch": 0.0004017687808189096,
+      "grad_norm": 0.9012013673782349,
+      "learning_rate": 0.00041999999999999996,
+      "loss": 2.8937,
+      "step": 7
+    },
+    {
+      "epoch": 0.00045916432093589664,
+      "grad_norm": 0.9427694082260132,
+      "learning_rate": 0.00047999999999999996,
+      "loss": 2.904,
+      "step": 8
+    },
+    {
+      "epoch": 0.0005165598610528837,
+      "grad_norm": 1.662156105041504,
+      "learning_rate": 0.00054,
+      "loss": 2.9114,
+      "step": 9
+    },
+    {
+      "epoch": 0.0005739554011698709,
+      "grad_norm": 1.2877967357635498,
+      "learning_rate": 0.0006,
+      "loss": 2.9185,
+      "step": 10
+    },
+    {
+      "epoch": 0.000631350941286858,
+      "grad_norm": 1.3717082738876343,
+      "learning_rate": 0.0005999969170437548,
+      "loss": 2.899,
+      "step": 11
+    },
+    {
+      "epoch": 0.000688746481403845,
+      "grad_norm": 1.3706175088882446,
+      "learning_rate": 0.0005999876683017478,
+      "loss": 2.8522,
+      "step": 12
+    },
+    {
+      "epoch": 0.0007461420215208321,
+      "grad_norm": 0.7431464791297913,
+      "learning_rate": 0.0005999722541541584,
+      "loss": 2.8894,
+      "step": 13
+    },
+    {
+      "epoch": 0.0008035375616378192,
+      "grad_norm": 0.5839619040489197,
+      "learning_rate": 0.0005999506752346019,
+      "loss": 2.8866,
+      "step": 14
+    },
+    {
+      "epoch": 0.0008609331017548062,
+      "grad_norm": 0.5229901671409607,
+      "learning_rate": 0.0005999229324301031,
+      "loss": 2.8608,
+      "step": 15
+    },
+    {
+      "epoch": 0.0009183286418717933,
+      "grad_norm": 0.6879259943962097,
+      "learning_rate": 0.00059988902688106,
+      "loss": 2.8801,
+      "step": 16
+    },
+    {
+      "epoch": 0.0009757241819887805,
+      "grad_norm": 0.4949502646923065,
+      "learning_rate": 0.0005998489599811971,
+      "loss": 2.8857,
+      "step": 17
+    },
+    {
+      "epoch": 0.0010331197221057674,
+      "grad_norm": 0.5659216642379761,
+      "learning_rate": 0.0005998027333775077,
+      "loss": 2.8172,
+      "step": 18
+    },
+    {
+      "epoch": 0.0010905152622227546,
+      "grad_norm": 0.43849167227745056,
+      "learning_rate": 0.0005997503489701861,
+      "loss": 2.8479,
+      "step": 19
+    },
+    {
+      "epoch": 0.0011479108023397418,
+      "grad_norm": 0.5036750435829163,
+      "learning_rate": 0.0005996918089125504,
+      "loss": 2.8957,
+      "step": 20
+    },
+    {
+      "epoch": 0.0012053063424567287,
+      "grad_norm": 0.40093106031417847,
+      "learning_rate": 0.000599627115610953,
+      "loss": 2.8951,
+      "step": 21
+    },
+    {
+      "epoch": 0.001262701882573716,
+      "grad_norm": 0.3499244153499603,
+      "learning_rate": 0.0005995562717246821,
+      "loss": 2.8535,
+      "step": 22
+    },
+    {
+      "epoch": 0.0013200974226907029,
+      "grad_norm": 0.3672889769077301,
+      "learning_rate": 0.0005994792801658526,
+      "loss": 2.8507,
+      "step": 23
+    },
+    {
+      "epoch": 0.00137749296280769,
+      "grad_norm": 0.3307906985282898,
+      "learning_rate": 0.0005993961440992859,
+      "loss": 2.8597,
+      "step": 24
+    },
+    {
+      "epoch": 0.001434888502924677,
+      "grad_norm": 0.33352652192115784,
+      "learning_rate": 0.0005993068669423797,
+      "loss": 2.8023,
+      "step": 25
+    },
+    {
+      "epoch": 0.0014922840430416642,
+      "grad_norm": 0.30308255553245544,
+      "learning_rate": 0.0005992114523649685,
+      "loss": 2.864,
+      "step": 26
+    },
+    {
+      "epoch": 0.0015496795831586513,
+      "grad_norm": 0.2800331711769104,
+      "learning_rate": 0.000599109904289172,
+      "loss": 2.8459,
+      "step": 27
+    },
+    {
+      "epoch": 0.0016070751232756383,
+      "grad_norm": 0.2467849850654602,
+      "learning_rate": 0.0005990022268892337,
+      "loss": 2.8298,
+      "step": 28
+    },
+    {
+      "epoch": 0.0016644706633926255,
+      "grad_norm": 0.25928932428359985,
+      "learning_rate": 0.0005988884245913497,
+      "loss": 2.8061,
+      "step": 29
+    },
+    {
+      "epoch": 0.0017218662035096124,
+      "grad_norm": 0.2770285904407501,
+      "learning_rate": 0.0005987685020734869,
+      "loss": 2.8363,
+      "step": 30
+    },
+    {
+      "epoch": 0.0017792617436265996,
+      "grad_norm": 0.2888840436935425,
+      "learning_rate": 0.0005986424642651901,
+      "loss": 2.847,
+      "step": 31
+    },
+    {
+      "epoch": 0.0018366572837435866,
+      "grad_norm": 0.3389260172843933,
+      "learning_rate": 0.0005985103163473802,
+      "loss": 2.8185,
+      "step": 32
+    },
+    {
+      "epoch": 0.0018940528238605737,
+      "grad_norm": 0.3043622672557831,
+      "learning_rate": 0.0005983720637521404,
+      "loss": 2.8073,
+      "step": 33
+    },
+    {
+      "epoch": 0.001951448363977561,
+      "grad_norm": 0.2626359760761261,
+      "learning_rate": 0.0005982277121624933,
+      "loss": 2.8278,
+      "step": 34
+    },
+    {
+      "epoch": 0.002008843904094548,
+      "grad_norm": 0.2601317763328552,
+      "learning_rate": 0.0005980772675121675,
+      "loss": 2.8293,
+      "step": 35
+    },
+    {
+      "epoch": 0.002066239444211535,
+      "grad_norm": 0.2932066023349762,
+      "learning_rate": 0.0005979207359853532,
+      "loss": 2.842,
+      "step": 36
+    },
+    {
+      "epoch": 0.002123634984328522,
+      "grad_norm": 0.3828963041305542,
+      "learning_rate": 0.0005977581240164485,
+      "loss": 2.8383,
+      "step": 37
+    },
+    {
+      "epoch": 0.002181030524445509,
+      "grad_norm": 0.2928522527217865,
+      "learning_rate": 0.0005975894382897944,
+      "loss": 2.8291,
+      "step": 38
+    },
+    {
+      "epoch": 0.0022384260645624964,
+      "grad_norm": 0.2287234663963318,
+      "learning_rate": 0.0005974146857394005,
+      "loss": 2.8422,
+      "step": 39
+    },
+    {
+      "epoch": 0.0022958216046794835,
+      "grad_norm": 0.2722682058811188,
+      "learning_rate": 0.0005972338735486597,
+      "loss": 2.8217,
+      "step": 40
+    },
+    {
+      "epoch": 0.0023532171447964703,
+      "grad_norm": 0.21170516312122345,
+      "learning_rate": 0.0005970470091500531,
+      "loss": 2.831,
+      "step": 41
+    },
+    {
+      "epoch": 0.0024106126849134575,
+      "grad_norm": 0.22243160009384155,
+      "learning_rate": 0.0005968541002248439,
+      "loss": 2.862,
+      "step": 42
+    },
+    {
+      "epoch": 0.0024680082250304446,
+      "grad_norm": 0.18485133349895477,
+      "learning_rate": 0.0005966551547027627,
+      "loss": 2.8531,
+      "step": 43
+    },
+    {
+      "epoch": 0.002525403765147432,
+      "grad_norm": 0.21640127897262573,
+      "learning_rate": 0.0005964501807616806,
+      "loss": 2.8245,
+      "step": 44
+    },
+    {
+      "epoch": 0.0025827993052644185,
+      "grad_norm": 0.2716100513935089,
+      "learning_rate": 0.0005962391868272735,
+      "loss": 2.8093,
+      "step": 45
+    },
+    {
+      "epoch": 0.0026401948453814057,
+      "grad_norm": 0.19726517796516418,
+      "learning_rate": 0.0005960221815726757,
+      "loss": 2.8214,
+      "step": 46
+    },
+    {
+      "epoch": 0.002697590385498393,
+      "grad_norm": 0.2424098700284958,
+      "learning_rate": 0.0005957991739181231,
+      "loss": 2.818,
+      "step": 47
+    },
+    {
+      "epoch": 0.00275498592561538,
+      "grad_norm": 0.2414388209581375,
+      "learning_rate": 0.0005955701730305872,
+      "loss": 2.8491,
+      "step": 48
+    },
+    {
+      "epoch": 0.0028123814657323673,
+      "grad_norm": 0.25403571128845215,
+      "learning_rate": 0.0005953351883233972,
+      "loss": 2.8321,
+      "step": 49
+    },
+    {
+      "epoch": 0.002869777005849354,
+      "grad_norm": 0.30923786759376526,
+      "learning_rate": 0.0005950942294558544,
+      "loss": 2.8298,
+      "step": 50
+    },
+    {
+      "epoch": 0.002927172545966341,
+      "grad_norm": 0.22294141352176666,
+      "learning_rate": 0.0005948473063328338,
+      "loss": 2.8015,
+      "step": 51
+    },
+    {
+      "epoch": 0.0029845680860833283,
+      "grad_norm": 0.2882789075374603,
+      "learning_rate": 0.0005945944291043779,
+      "loss": 2.8256,
+      "step": 52
+    },
+    {
+      "epoch": 0.0030419636262003155,
+      "grad_norm": 0.25416064262390137,
+      "learning_rate": 0.0005943356081652793,
+      "loss": 2.8211,
+      "step": 53
+    },
+    {
+      "epoch": 0.0030993591663173027,
+      "grad_norm": 0.2488490343093872,
+      "learning_rate": 0.0005940708541546529,
+      "loss": 2.8618,
+      "step": 54
+    },
+    {
+      "epoch": 0.0031567547064342894,
+      "grad_norm": 0.27515849471092224,
+      "learning_rate": 0.000593800177955499,
+      "loss": 2.802,
+      "step": 55
+    },
+    {
+      "epoch": 0.0032141502465512766,
+      "grad_norm": 0.2030380666255951,
+      "learning_rate": 0.0005935235906942563,
+      "loss": 2.8229,
+      "step": 56
+    },
+    {
+      "epoch": 0.003271545786668264,
+      "grad_norm": 0.2384052276611328,
+      "learning_rate": 0.0005932411037403436,
+      "loss": 2.8122,
+      "step": 57
+    },
+    {
+      "epoch": 0.003328941326785251,
+      "grad_norm": 0.2543489336967468,
+      "learning_rate": 0.000592952728705693,
+      "loss": 2.8302,
+      "step": 58
+    },
+    {
+      "epoch": 0.003386336866902238,
+      "grad_norm": 0.2387794405221939,
+      "learning_rate": 0.000592658477444273,
+      "loss": 2.835,
+      "step": 59
+    },
+    {
+      "epoch": 0.003443732407019225,
+      "grad_norm": 0.2748169004917145,
+      "learning_rate": 0.0005923583620516003,
+      "loss": 2.834,
+      "step": 60
+    },
+    {
+      "epoch": 0.003501127947136212,
+      "grad_norm": 0.2565017640590668,
+      "learning_rate": 0.0005920523948642431,
+      "loss": 2.8452,
+      "step": 61
+    },
+    {
+      "epoch": 0.0035585234872531992,
+      "grad_norm": 0.25502678751945496,
+      "learning_rate": 0.0005917405884593144,
+      "loss": 2.8345,
+      "step": 62
+    },
+    {
+      "epoch": 0.0036159190273701864,
+      "grad_norm": 0.22830121219158173,
+      "learning_rate": 0.0005914229556539538,
+      "loss": 2.7989,
+      "step": 63
+    },
+    {
+      "epoch": 0.003673314567487173,
+      "grad_norm": 0.3146669268608093,
+      "learning_rate": 0.0005910995095048024,
+      "loss": 2.845,
+      "step": 64
+    },
+    {
+      "epoch": 0.0037307101076041603,
+      "grad_norm": 0.2924383580684662,
+      "learning_rate": 0.000590770263307464,
+      "loss": 2.8303,
+      "step": 65
+    },
+    {
+      "epoch": 0.0037881056477211475,
+      "grad_norm": 0.2577711343765259,
+      "learning_rate": 0.0005904352305959605,
+      "loss": 2.8156,
+      "step": 66
+    },
+    {
+      "epoch": 0.0038455011878381347,
+      "grad_norm": 0.2631978988647461,
+      "learning_rate": 0.0005900944251421745,
+      "loss": 2.833,
+      "step": 67
+    },
+    {
+      "epoch": 0.003902896727955122,
+      "grad_norm": 0.21994397044181824,
+      "learning_rate": 0.000589747860955283,
+      "loss": 2.8136,
+      "step": 68
+    },
+    {
+      "epoch": 0.003960292268072109,
+      "grad_norm": 0.3000943064689636,
+      "learning_rate": 0.0005893955522811827,
+      "loss": 2.8415,
+      "step": 69
+    },
+    {
+      "epoch": 0.004017687808189096,
+      "grad_norm": 0.24310976266860962,
+      "learning_rate": 0.0005890375136019032,
+      "loss": 2.8148,
+      "step": 70
+    },
+    {
+      "epoch": 0.004075083348306083,
+      "grad_norm": 0.24616850912570953,
+      "learning_rate": 0.0005886737596350122,
+      "loss": 2.8329,
+      "step": 71
+    },
+    {
+      "epoch": 0.00413247888842307,
+      "grad_norm": 0.2714521884918213,
+      "learning_rate": 0.0005883043053330105,
+      "loss": 2.8356,
+      "step": 72
+    },
+    {
+      "epoch": 0.004189874428540057,
+      "grad_norm": 0.2601388096809387,
+      "learning_rate": 0.0005879291658827176,
+      "loss": 2.8228,
+      "step": 73
+    },
+    {
+      "epoch": 0.004247269968657044,
+      "grad_norm": 0.22764116525650024,
+      "learning_rate": 0.0005875483567046467,
+      "loss": 2.801,
+      "step": 74
+    },
+    {
+      "epoch": 0.004304665508774032,
+      "grad_norm": 0.22346433997154236,
+      "learning_rate": 0.0005871618934523719,
+      "loss": 2.7948,
+      "step": 75
+    },
+    {
+      "epoch": 0.004362061048891018,
+      "grad_norm": 0.18839874863624573,
+      "learning_rate": 0.0005867697920118835,
+      "loss": 2.8341,
+      "step": 76
+    },
+    {
+      "epoch": 0.004419456589008005,
+      "grad_norm": 0.25794312357902527,
+      "learning_rate": 0.0005863720685009362,
+      "loss": 2.815,
+      "step": 77
+    },
+    {
+      "epoch": 0.004476852129124993,
+      "grad_norm": 0.2352106124162674,
+      "learning_rate": 0.0005859687392683856,
+      "loss": 2.8169,
+      "step": 78
+    },
+    {
+      "epoch": 0.0045342476692419795,
+      "grad_norm": 0.28784099221229553,
+      "learning_rate": 0.0005855598208935169,
+      "loss": 2.8506,
+      "step": 79
+    },
+    {
+      "epoch": 0.004591643209358967,
+      "grad_norm": 0.22999855875968933,
+      "learning_rate": 0.0005851453301853628,
+      "loss": 2.8377,
+      "step": 80
+    },
+    {
+      "epoch": 0.004649038749475954,
+      "grad_norm": 0.21411263942718506,
+      "learning_rate": 0.0005847252841820128,
+      "loss": 2.8137,
+      "step": 81
+    },
+    {
+      "epoch": 0.0047064342895929406,
+      "grad_norm": 0.2420736700296402,
+      "learning_rate": 0.0005842997001499129,
+      "loss": 2.7929,
+      "step": 82
+    },
+    {
+      "epoch": 0.004763829829709928,
+      "grad_norm": 0.24426190555095673,
+      "learning_rate": 0.0005838685955831558,
+      "loss": 2.8273,
+      "step": 83
+    },
+    {
+      "epoch": 0.004821225369826915,
+      "grad_norm": 0.20297811925411224,
+      "learning_rate": 0.0005834319882027617,
+      "loss": 2.7993,
+      "step": 84
+    },
+    {
+      "epoch": 0.0048786209099439025,
+      "grad_norm": 0.2474389523267746,
+      "learning_rate": 0.00058298989595595,
+      "loss": 2.8252,
+      "step": 85
+    },
+    {
+      "epoch": 0.004936016450060889,
+      "grad_norm": 0.22601982951164246,
+      "learning_rate": 0.0005825423370154012,
+      "loss": 2.8421,
+      "step": 86
+    },
+    {
+      "epoch": 0.004993411990177876,
+      "grad_norm": 0.24997788667678833,
+      "learning_rate": 0.0005820893297785106,
+      "loss": 2.8485,
+      "step": 87
+    },
+    {
+      "epoch": 0.005050807530294864,
+      "grad_norm": 0.19994623959064484,
+      "learning_rate": 0.0005816308928666314,
+      "loss": 2.8456,
+      "step": 88
+    },
+    {
+      "epoch": 0.00510820307041185,
+      "grad_norm": 0.19206245243549347,
+      "learning_rate": 0.0005811670451243093,
+      "loss": 2.8035,
+      "step": 89
+    },
+    {
+      "epoch": 0.005165598610528837,
+      "grad_norm": 0.2515026032924652,
+      "learning_rate": 0.0005806978056185083,
+      "loss": 2.8232,
+      "step": 90
+    },
+    {
+      "epoch": 0.005222994150645825,
+      "grad_norm": 0.22921022772789001,
+      "learning_rate": 0.0005802231936378267,
+      "loss": 2.8366,
+      "step": 91
+    },
+    {
+      "epoch": 0.0052803896907628114,
+      "grad_norm": 0.248809352517128,
+      "learning_rate": 0.000579743228691704,
+      "loss": 2.8331,
+      "step": 92
+    },
+    {
+      "epoch": 0.005337785230879799,
+      "grad_norm": 0.18247073888778687,
+      "learning_rate": 0.0005792579305096191,
+      "loss": 2.8249,
+      "step": 93
+    },
+    {
+      "epoch": 0.005395180770996786,
+      "grad_norm": 0.2440440058708191,
+      "learning_rate": 0.0005787673190402799,
+      "loss": 2.837,
+      "step": 94
+    },
+    {
+      "epoch": 0.0054525763111137725,
+      "grad_norm": 0.21160444617271423,
+      "learning_rate": 0.0005782714144508019,
+      "loss": 2.7864,
+      "step": 95
+    },
+    {
+      "epoch": 0.00550997185123076,
+      "grad_norm": 0.21344538033008575,
+      "learning_rate": 0.0005777702371258806,
+      "loss": 2.847,
+      "step": 96
+    },
+    {
+      "epoch": 0.005567367391347747,
+      "grad_norm": 0.24861139059066772,
+      "learning_rate": 0.0005772638076669529,
+      "loss": 2.8267,
+      "step": 97
+    },
+    {
+      "epoch": 0.0056247629314647345,
+      "grad_norm": 0.290520042181015,
+      "learning_rate": 0.0005767521468913501,
+      "loss": 2.827,
+      "step": 98
+    },
+    {
+      "epoch": 0.005682158471581721,
+      "grad_norm": 0.20536312460899353,
+      "learning_rate": 0.0005762352758314429,
+      "loss": 2.8476,
+      "step": 99
+    },
+    {
+      "epoch": 0.005739554011698708,
+      "grad_norm": 0.21782469749450684,
+      "learning_rate": 0.000575713215733776,
+      "loss": 2.844,
+      "step": 100
+    },
+    {
+      "epoch": 0.005739554011698708,
+      "eval_loss": 2.7509028911590576,
+      "eval_runtime": 85.2068,
+      "eval_samples_per_second": 50.641,
+      "eval_steps_per_second": 12.663,
+      "step": 100
+    },
+    {
+      "epoch": 0.005796949551815696,
+      "grad_norm": 0.2523731291294098,
+      "learning_rate": 0.0005751859880581954,
+      "loss": 2.8125,
+      "step": 101
+    },
+    {
+      "epoch": 0.005854345091932682,
+      "grad_norm": 0.30107325315475464,
+      "learning_rate": 0.0005746536144769656,
+      "loss": 2.8108,
+      "step": 102
+    },
+    {
+      "epoch": 0.00591174063204967,
+      "grad_norm": 0.24103832244873047,
+      "learning_rate": 0.0005741161168738794,
+      "loss": 2.8282,
+      "step": 103
+    },
+    {
+      "epoch": 0.005969136172166657,
+      "grad_norm": 0.31273001432418823,
+      "learning_rate": 0.0005735735173433582,
+      "loss": 2.8104,
+      "step": 104
+    },
+    {
+      "epoch": 0.006026531712283643,
+      "grad_norm": 0.19059035181999207,
+      "learning_rate": 0.0005730258381895433,
+      "loss": 2.8186,
+      "step": 105
+    },
+    {
+      "epoch": 0.006083927252400631,
+      "grad_norm": 0.25082021951675415,
+      "learning_rate": 0.0005724731019253797,
+      "loss": 2.8154,
+      "step": 106
+    },
+    {
+      "epoch": 0.006141322792517618,
+      "grad_norm": 0.23254480957984924,
+      "learning_rate": 0.0005719153312716904,
+      "loss": 2.8121,
+      "step": 107
+    },
+    {
+      "epoch": 0.006198718332634605,
+      "grad_norm": 0.24095705151557922,
+      "learning_rate": 0.0005713525491562421,
+      "loss": 2.8361,
+      "step": 108
+    },
+    {
+      "epoch": 0.006256113872751592,
+      "grad_norm": 0.17760275304317474,
+      "learning_rate": 0.0005707847787128034,
+      "loss": 2.8396,
+      "step": 109
+    },
+    {
+      "epoch": 0.006313509412868579,
+      "grad_norm": 0.20905229449272156,
+      "learning_rate": 0.0005702120432801934,
+      "loss": 2.8284,
+      "step": 110
+    },
+    {
+      "epoch": 0.0063709049529855665,
+      "grad_norm": 0.19538630545139313,
+      "learning_rate": 0.0005696343664013227,
+      "loss": 2.8417,
+      "step": 111
+    },
+    {
+      "epoch": 0.006428300493102553,
+      "grad_norm": 0.2408672571182251,
+      "learning_rate": 0.0005690517718222248,
+      "loss": 2.8416,
+      "step": 112
+    },
+    {
+      "epoch": 0.006485696033219541,
+      "grad_norm": 0.19618412852287292,
+      "learning_rate": 0.0005684642834910813,
+      "loss": 2.8683,
+      "step": 113
+    },
+    {
+      "epoch": 0.006543091573336528,
+      "grad_norm": 0.17854906618595123,
+      "learning_rate": 0.0005678719255572363,
+      "loss": 2.8232,
+      "step": 114
+    },
+    {
+      "epoch": 0.006600487113453514,
+      "grad_norm": 0.2527766227722168,
+      "learning_rate": 0.0005672747223702044,
+      "loss": 2.8219,
+      "step": 115
+    },
+    {
+      "epoch": 0.006657882653570502,
+      "grad_norm": 0.21465440094470978,
+      "learning_rate": 0.0005666726984786695,
+      "loss": 2.8308,
+      "step": 116
+    },
+    {
+      "epoch": 0.006715278193687489,
+      "grad_norm": 0.2080729454755783,
+      "learning_rate": 0.000566065878629476,
+      "loss": 2.8369,
+      "step": 117
+    },
+    {
+      "epoch": 0.006772673733804476,
+      "grad_norm": 0.18979360163211823,
+      "learning_rate": 0.0005654542877666108,
+      "loss": 2.7997,
+      "step": 118
+    },
+    {
+      "epoch": 0.006830069273921463,
+      "grad_norm": 0.20258580148220062,
+      "learning_rate": 0.0005648379510301792,
+      "loss": 2.846,
+      "step": 119
+    },
+    {
+      "epoch": 0.00688746481403845,
+      "grad_norm": 0.2112026810646057,
+      "learning_rate": 0.0005642168937553701,
+      "loss": 2.8521,
+      "step": 120
+    },
+    {
+      "epoch": 0.006944860354155437,
+      "grad_norm": 0.25105029344558716,
+      "learning_rate": 0.0005635911414714158,
+      "loss": 2.8081,
+      "step": 121
+    },
+    {
+      "epoch": 0.007002255894272424,
+      "grad_norm": 0.21830224990844727,
+      "learning_rate": 0.0005629607199005416,
+      "loss": 2.8161,
+      "step": 122
+    },
+    {
+      "epoch": 0.007059651434389411,
+      "grad_norm": 0.19216330349445343,
+      "learning_rate": 0.0005623256549569091,
+      "loss": 2.805,
+      "step": 123
+    },
+    {
+      "epoch": 0.0071170469745063985,
+      "grad_norm": 0.19969609379768372,
+      "learning_rate": 0.000561685972745551,
+      "loss": 2.7859,
+      "step": 124
+    },
+    {
+      "epoch": 0.007174442514623385,
+      "grad_norm": 0.22093947231769562,
+      "learning_rate": 0.0005610416995612973,
+      "loss": 2.8194,
+      "step": 125
+    },
+    {
+      "epoch": 0.007231838054740373,
+      "grad_norm": 0.2148187905550003,
+      "learning_rate": 0.0005603928618876952,
+      "loss": 2.8565,
+      "step": 126
+    },
+    {
+      "epoch": 0.0072892335948573595,
+      "grad_norm": 0.18277674913406372,
+      "learning_rate": 0.0005597394863959201,
+      "loss": 2.8187,
+      "step": 127
+    },
+    {
+      "epoch": 0.007346629134974346,
+      "grad_norm": 0.22607837617397308,
+      "learning_rate": 0.0005590815999436795,
+      "loss": 2.8607,
+      "step": 128
+    },
+    {
+      "epoch": 0.007404024675091334,
+      "grad_norm": 0.22417186200618744,
+      "learning_rate": 0.0005584192295741086,
+      "loss": 2.8198,
+      "step": 129
+    },
+    {
+      "epoch": 0.007461420215208321,
+      "grad_norm": 0.229670912027359,
+      "learning_rate": 0.0005577524025146591,
+      "loss": 2.8477,
+      "step": 130
+    },
+    {
+      "epoch": 0.007518815755325308,
+      "grad_norm": 0.1985808163881302,
+      "learning_rate": 0.0005570811461759794,
+      "loss": 2.8058,
+      "step": 131
+    },
+    {
+      "epoch": 0.007576211295442295,
+      "grad_norm": 0.22260330617427826,
+      "learning_rate": 0.0005564054881507886,
+      "loss": 2.8369,
+      "step": 132
+    },
+    {
+      "epoch": 0.007633606835559282,
+      "grad_norm": 0.20925524830818176,
+      "learning_rate": 0.0005557254562127417,
+      "loss": 2.8205,
+      "step": 133
+    },
+    {
+      "epoch": 0.007691002375676269,
+      "grad_norm": 0.26581674814224243,
+      "learning_rate": 0.0005550410783152882,
+      "loss": 2.8164,
+      "step": 134
+    },
+    {
+      "epoch": 0.007748397915793256,
+      "grad_norm": 0.2182077318429947,
+      "learning_rate": 0.0005543523825905229,
+      "loss": 2.8279,
+      "step": 135
+    },
+    {
+      "epoch": 0.007805793455910244,
+      "grad_norm": 0.24468722939491272,
+      "learning_rate": 0.0005536593973480297,
+      "loss": 2.8281,
+      "step": 136
+    },
+    {
+      "epoch": 0.007863188996027231,
+      "grad_norm": 0.22021321952342987,
+      "learning_rate": 0.0005529621510737175,
+      "loss": 2.8028,
+      "step": 137
+    },
+    {
+      "epoch": 0.007920584536144217,
+      "grad_norm": 0.20566654205322266,
+      "learning_rate": 0.0005522606724286498,
+      "loss": 2.7937,
+      "step": 138
+    },
+    {
+      "epoch": 0.007977980076261205,
+      "grad_norm": 0.1960543841123581,
+      "learning_rate": 0.0005515549902478665,
+      "loss": 2.8089,
+      "step": 139
+    },
+    {
+      "epoch": 0.008035375616378192,
+      "grad_norm": 0.2689999043941498,
+      "learning_rate": 0.0005508451335391975,
+      "loss": 2.7959,
+      "step": 140
+    },
+    {
+      "epoch": 0.008092771156495178,
+      "grad_norm": 0.19776718318462372,
+      "learning_rate": 0.0005501311314820721,
+      "loss": 2.8442,
+      "step": 141
+    },
+    {
+      "epoch": 0.008150166696612166,
+      "grad_norm": 0.2156287282705307,
+      "learning_rate": 0.0005494130134263184,
+      "loss": 2.8224,
+      "step": 142
+    },
+    {
+      "epoch": 0.008207562236729153,
+      "grad_norm": 0.17528703808784485,
+      "learning_rate": 0.0005486908088909568,
+      "loss": 2.8659,
+      "step": 143
+    },
+    {
+      "epoch": 0.00826495777684614,
+      "grad_norm": 0.1757359504699707,
+      "learning_rate": 0.0005479645475629872,
+      "loss": 2.8119,
+      "step": 144
+    },
+    {
+      "epoch": 0.008322353316963127,
+      "grad_norm": 0.1916513890028,
+      "learning_rate": 0.0005472342592961683,
+      "loss": 2.8069,
+      "step": 145
+    },
+    {
+      "epoch": 0.008379748857080115,
+      "grad_norm": 0.19162799417972565,
+      "learning_rate": 0.0005464999741097901,
+      "loss": 2.8211,
+      "step": 146
+    },
+    {
+      "epoch": 0.0084371443971971,
+      "grad_norm": 0.1881379634141922,
+      "learning_rate": 0.0005457617221874408,
+      "loss": 2.7954,
+      "step": 147
+    },
+    {
+      "epoch": 0.008494539937314088,
+      "grad_norm": 0.22305060923099518,
+      "learning_rate": 0.0005450195338757654,
+      "loss": 2.8447,
+      "step": 148
+    },
+    {
+      "epoch": 0.008551935477431076,
+      "grad_norm": 0.25081732869148254,
+      "learning_rate": 0.0005442734396832185,
+      "loss": 2.8205,
+      "step": 149
+    },
+    {
+      "epoch": 0.008609331017548063,
+      "grad_norm": 0.24046167731285095,
+      "learning_rate": 0.00054352347027881,
+      "loss": 2.8246,
+      "step": 150
+    },
+    {
+      "epoch": 0.00866672655766505,
+      "grad_norm": 0.20985569059848785,
+      "learning_rate": 0.0005427696564908447,
+      "loss": 2.8384,
+      "step": 151
+    },
+    {
+      "epoch": 0.008724122097782037,
+      "grad_norm": 0.18979063630104065,
+      "learning_rate": 0.000542012029305655,
+      "loss": 2.8261,
+      "step": 152
+    },
+    {
+      "epoch": 0.008781517637899024,
+      "grad_norm": 0.21513347327709198,
+      "learning_rate": 0.0005412506198663268,
+      "loss": 2.8197,
+      "step": 153
+    },
+    {
+      "epoch": 0.00883891317801601,
+      "grad_norm": 0.25432831048965454,
+      "learning_rate": 0.0005404854594714204,
+      "loss": 2.8091,
+      "step": 154
+    },
+    {
+      "epoch": 0.008896308718132998,
+      "grad_norm": 0.261273592710495,
+      "learning_rate": 0.0005397165795736823,
+      "loss": 2.8324,
+      "step": 155
+    },
+    {
+      "epoch": 0.008953704258249985,
+      "grad_norm": 0.22144336998462677,
+      "learning_rate": 0.0005389440117787538,
+      "loss": 2.8459,
+      "step": 156
+    },
+    {
+      "epoch": 0.009011099798366971,
+      "grad_norm": 0.1860560178756714,
+      "learning_rate": 0.000538167787843871,
+      "loss": 2.8552,
+      "step": 157
+    },
+    {
+      "epoch": 0.009068495338483959,
+      "grad_norm": 0.2402401566505432,
+      "learning_rate": 0.0005373879396765593,
+      "loss": 2.8229,
+      "step": 158
+    },
+    {
+      "epoch": 0.009125890878600947,
+      "grad_norm": 0.2112584114074707,
+      "learning_rate": 0.0005366044993333228,
+      "loss": 2.823,
+      "step": 159
+    },
+    {
+      "epoch": 0.009183286418717934,
+      "grad_norm": 0.24757996201515198,
+      "learning_rate": 0.0005358174990183254,
+      "loss": 2.8458,
+      "step": 160
+    },
+    {
+      "epoch": 0.00924068195883492,
+      "grad_norm": 0.20984984934329987,
+      "learning_rate": 0.0005350269710820675,
+      "loss": 2.8375,
+      "step": 161
+    },
+    {
+      "epoch": 0.009298077498951908,
+      "grad_norm": 0.22329501807689667,
+      "learning_rate": 0.0005342329480200562,
+      "loss": 2.815,
+      "step": 162
+    },
+    {
+      "epoch": 0.009355473039068895,
+      "grad_norm": 0.26144203543663025,
+      "learning_rate": 0.0005334354624714697,
+      "loss": 2.8286,
+      "step": 163
+    },
+    {
+      "epoch": 0.009412868579185881,
+      "grad_norm": 0.20015327632427216,
+      "learning_rate": 0.0005326345472178154,
+      "loss": 2.8304,
+      "step": 164
+    },
+    {
+      "epoch": 0.009470264119302869,
+      "grad_norm": 0.29256758093833923,
+      "learning_rate": 0.0005318302351815823,
+      "loss": 2.7884,
+      "step": 165
+    },
+    {
+      "epoch": 0.009527659659419856,
+      "grad_norm": 0.22914084792137146,
+      "learning_rate": 0.000531022559424888,
+      "loss": 2.8253,
+      "step": 166
+    },
+    {
+      "epoch": 0.009585055199536842,
+      "grad_norm": 0.2677003741264343,
+      "learning_rate": 0.0005302115531481195,
+      "loss": 2.8084,
+      "step": 167
+    },
+    {
+      "epoch": 0.00964245073965383,
+      "grad_norm": 0.2672327756881714,
+      "learning_rate": 0.000529397249688568,
+      "loss": 2.8351,
+      "step": 168
+    },
+    {
+      "epoch": 0.009699846279770817,
+      "grad_norm": 0.21281464397907257,
+      "learning_rate": 0.0005285796825190598,
+      "loss": 2.8463,
+      "step": 169
+    },
+    {
+      "epoch": 0.009757241819887805,
+      "grad_norm": 0.22858156263828278,
+      "learning_rate": 0.0005277588852465788,
+      "loss": 2.8156,
+      "step": 170
+    },
+    {
+      "epoch": 0.009814637360004791,
+      "grad_norm": 0.20694582164287567,
+      "learning_rate": 0.0005269348916108859,
+      "loss": 2.8392,
+      "step": 171
+    },
+    {
+      "epoch": 0.009872032900121779,
+      "grad_norm": 0.22438685595989227,
+      "learning_rate": 0.0005261077354831322,
+      "loss": 2.8336,
+      "step": 172
+    },
+    {
+      "epoch": 0.009929428440238766,
+      "grad_norm": 0.2279587984085083,
+      "learning_rate": 0.0005252774508644666,
+      "loss": 2.7972,
+      "step": 173
+    },
+    {
+      "epoch": 0.009986823980355752,
+      "grad_norm": 0.21278439462184906,
+      "learning_rate": 0.0005244440718846375,
+      "loss": 2.7946,
+      "step": 174
+    },
+    {
+      "epoch": 0.01004421952047274,
+      "grad_norm": 0.23399871587753296,
+      "learning_rate": 0.0005236076328005906,
+      "loss": 2.8648,
+      "step": 175
+    },
+    {
+      "epoch": 0.010101615060589727,
+      "grad_norm": 0.2649572193622589,
+      "learning_rate": 0.0005227681679950607,
+      "loss": 2.8453,
+      "step": 176
+    },
+    {
+      "epoch": 0.010159010600706713,
+      "grad_norm": 0.21067285537719727,
+      "learning_rate": 0.0005219257119751581,
+      "loss": 2.8357,
+      "step": 177
+    },
+    {
+      "epoch": 0.0102164061408237,
+      "grad_norm": 0.22862860560417175,
+      "learning_rate": 0.0005210802993709497,
+      "loss": 2.8235,
+      "step": 178
+    },
+    {
+      "epoch": 0.010273801680940688,
+      "grad_norm": 0.22179283201694489,
+      "learning_rate": 0.0005202319649340369,
+      "loss": 2.82,
+      "step": 179
+    },
+    {
+      "epoch": 0.010331197221057674,
+      "grad_norm": 0.16690605878829956,
+      "learning_rate": 0.0005193807435361252,
+      "loss": 2.8237,
+      "step": 180
+    },
+    {
+      "epoch": 0.010388592761174662,
+      "grad_norm": 0.21572506427764893,
+      "learning_rate": 0.0005185266701675927,
+      "loss": 2.8403,
+      "step": 181
+    },
+    {
+      "epoch": 0.01044598830129165,
+      "grad_norm": 0.1778525710105896,
+      "learning_rate": 0.0005176697799360502,
+      "loss": 2.8204,
+      "step": 182
+    },
+    {
+      "epoch": 0.010503383841408637,
+      "grad_norm": 0.18887534737586975,
+      "learning_rate": 0.0005168101080648989,
+      "loss": 2.8146,
+      "step": 183
+    },
+    {
+      "epoch": 0.010560779381525623,
+      "grad_norm": 0.18108077347278595,
+      "learning_rate": 0.0005159476898918823,
+      "loss": 2.853,
+      "step": 184
+    },
+    {
+      "epoch": 0.01061817492164261,
+      "grad_norm": 0.1870754212141037,
+      "learning_rate": 0.0005150825608676336,
+      "loss": 2.8537,
+      "step": 185
+    },
+    {
+      "epoch": 0.010675570461759598,
+      "grad_norm": 0.16484060883522034,
+      "learning_rate": 0.0005142147565542188,
+      "loss": 2.8194,
+      "step": 186
+    },
+    {
+      "epoch": 0.010732966001876584,
+      "grad_norm": 0.18527449667453766,
+      "learning_rate": 0.0005133443126236739,
+      "loss": 2.8402,
+      "step": 187
+    },
+    {
+      "epoch": 0.010790361541993572,
+      "grad_norm": 0.17674389481544495,
+      "learning_rate": 0.0005124712648565398,
+      "loss": 2.8412,
+      "step": 188
+    },
+    {
+      "epoch": 0.01084775708211056,
+      "grad_norm": 0.2521503269672394,
+      "learning_rate": 0.0005115956491403907,
+      "loss": 2.8348,
+      "step": 189
+    },
+    {
+      "epoch": 0.010905152622227545,
+      "grad_norm": 0.17621657252311707,
+      "learning_rate": 0.000510717501468359,
+      "loss": 2.8293,
+      "step": 190
+    },
+    {
+      "epoch": 0.010962548162344533,
+      "grad_norm": 0.2621336579322815,
+      "learning_rate": 0.0005098368579376563,
+      "loss": 2.8164,
+      "step": 191
+    },
+    {
+      "epoch": 0.01101994370246152,
+      "grad_norm": 0.18950189650058746,
+      "learning_rate": 0.0005089537547480885,
+      "loss": 2.7976,
+      "step": 192
+    },
+    {
+      "epoch": 0.011077339242578508,
+      "grad_norm": 0.24857239425182343,
+      "learning_rate": 0.0005080682282005692,
+      "loss": 2.8323,
+      "step": 193
+    },
+    {
+      "epoch": 0.011134734782695494,
+      "grad_norm": 0.16708490252494812,
+      "learning_rate": 0.0005071803146956262,
+      "loss": 2.801,
+      "step": 194
+    },
+    {
+      "epoch": 0.011192130322812481,
+      "grad_norm": 0.24443359673023224,
+      "learning_rate": 0.000506290050731906,
+      "loss": 2.8121,
+      "step": 195
+    },
+    {
+      "epoch": 0.011249525862929469,
+      "grad_norm": 0.2458924949169159,
+      "learning_rate": 0.0005053974729046734,
+      "loss": 2.8325,
+      "step": 196
+    },
+    {
+      "epoch": 0.011306921403046455,
+      "grad_norm": 0.2034812569618225,
+      "learning_rate": 0.0005045026179043067,
+      "loss": 2.8123,
+      "step": 197
+    },
+    {
+      "epoch": 0.011364316943163442,
+      "grad_norm": 0.2774895429611206,
+      "learning_rate": 0.0005036055225147901,
+      "loss": 2.8324,
+      "step": 198
+    },
+    {
+      "epoch": 0.01142171248328043,
+      "grad_norm": 0.22201013565063477,
+      "learning_rate": 0.0005027062236122014,
+      "loss": 2.8195,
+      "step": 199
+    },
+    {
+      "epoch": 0.011479108023397416,
+      "grad_norm": 0.1893691122531891,
+      "learning_rate": 0.0005018047581631961,
+      "loss": 2.8177,
+      "step": 200
+    },
+    {
+      "epoch": 0.011479108023397416,
+      "eval_loss": 2.749150037765503,
+      "eval_runtime": 85.2258,
+      "eval_samples_per_second": 50.63,
+      "eval_steps_per_second": 12.66,
+      "step": 200
+    },
+    {
+      "epoch": 0.011536503563514404,
+      "grad_norm": 0.2689765691757202,
+      "learning_rate": 0.0005009011632234881,
+      "loss": 2.8438,
+      "step": 201
+    },
+    {
+      "epoch": 0.011593899103631391,
+      "grad_norm": 0.2234533727169037,
+      "learning_rate": 0.0004999954759363262,
+      "loss": 2.8103,
+      "step": 202
+    },
+    {
+      "epoch": 0.011651294643748379,
+      "grad_norm": 0.25140801072120667,
+      "learning_rate": 0.0004990877335309675,
+      "loss": 2.8178,
+      "step": 203
+    },
+    {
+      "epoch": 0.011708690183865365,
+      "grad_norm": 0.3070688843727112,
+      "learning_rate": 0.0004981779733211468,
+      "loss": 2.8518,
+      "step": 204
+    },
+    {
+      "epoch": 0.011766085723982352,
+      "grad_norm": 0.25637757778167725,
+      "learning_rate": 0.0004972662327035431,
+      "loss": 2.8578,
+      "step": 205
+    },
+    {
+      "epoch": 0.01182348126409934,
+      "grad_norm": 0.2551119923591614,
+      "learning_rate": 0.0004963525491562421,
+      "loss": 2.8237,
+      "step": 206
+    },
+    {
+      "epoch": 0.011880876804216326,
+      "grad_norm": 0.2416735738515854,
+      "learning_rate": 0.0004954369602371958,
+      "loss": 2.8195,
+      "step": 207
+    },
+    {
+      "epoch": 0.011938272344333313,
+      "grad_norm": 0.3950039744377136,
+      "learning_rate": 0.0004945195035826785,
+      "loss": 2.8087,
+      "step": 208
+    },
+    {
+      "epoch": 0.011995667884450301,
+      "grad_norm": 0.16370531916618347,
+      "learning_rate": 0.00049360021690574,
+      "loss": 2.8464,
+      "step": 209
+    },
+    {
+      "epoch": 0.012053063424567287,
+      "grad_norm": 0.28070008754730225,
+      "learning_rate": 0.0004926791379946549,
+      "loss": 2.8377,
+      "step": 210
+    },
+    {
+      "epoch": 0.012110458964684274,
+      "grad_norm": 0.1902085244655609,
+      "learning_rate": 0.0004917563047113695,
+      "loss": 2.8279,
+      "step": 211
+    },
+    {
+      "epoch": 0.012167854504801262,
+      "grad_norm": 0.27748385071754456,
+      "learning_rate": 0.0004908317549899456,
+      "loss": 2.837,
+      "step": 212
+    },
+    {
+      "epoch": 0.012225250044918248,
+      "grad_norm": 0.18437190353870392,
+      "learning_rate": 0.0004899055268350012,
+      "loss": 2.8301,
+      "step": 213
+    },
+    {
+      "epoch": 0.012282645585035236,
+      "grad_norm": 0.22971947491168976,
+      "learning_rate": 0.0004889776583201479,
+      "loss": 2.8051,
+      "step": 214
+    },
+    {
+      "epoch": 0.012340041125152223,
+      "grad_norm": 0.238089457154274,
+      "learning_rate": 0.0004880481875864261,
+      "loss": 2.8162,
+      "step": 215
+    },
+    {
+      "epoch": 0.01239743666526921,
+      "grad_norm": 0.24253320693969727,
+      "learning_rate": 0.0004871171528407371,
+      "loss": 2.8181,
+      "step": 216
+    },
+    {
+      "epoch": 0.012454832205386197,
+      "grad_norm": 0.2351958006620407,
+      "learning_rate": 0.0004861845923542728,
+      "loss": 2.8136,
+      "step": 217
+    },
+    {
+      "epoch": 0.012512227745503184,
+      "grad_norm": 0.23203608393669128,
+      "learning_rate": 0.0004852505444609422,
+      "loss": 2.804,
+      "step": 218
+    },
+    {
+      "epoch": 0.012569623285620172,
+      "grad_norm": 0.1896822452545166,
+      "learning_rate": 0.00048431504755579575,
+      "loss": 2.8118,
+      "step": 219
+    },
+    {
+      "epoch": 0.012627018825737158,
+      "grad_norm": 0.18357349932193756,
+      "learning_rate": 0.0004833781400934471,
+      "loss": 2.8205,
+      "step": 220
+    },
+    {
+      "epoch": 0.012684414365854145,
+      "grad_norm": 0.23723295331001282,
+      "learning_rate": 0.00048243986058649246,
+      "loss": 2.8291,
+      "step": 221
+    },
+    {
+      "epoch": 0.012741809905971133,
+      "grad_norm": 0.1937919706106186,
+      "learning_rate": 0.0004815002476039273,
+      "loss": 2.8416,
+      "step": 222
+    },
+    {
+      "epoch": 0.012799205446088119,
+      "grad_norm": 0.19754467904567719,
+      "learning_rate": 0.0004805593397695613,
+      "loss": 2.7963,
+      "step": 223
+    },
+    {
+      "epoch": 0.012856600986205106,
+      "grad_norm": 0.1592610776424408,
+      "learning_rate": 0.00047961717576043,
+      "loss": 2.8264,
+      "step": 224
+    },
+    {
+      "epoch": 0.012913996526322094,
+      "grad_norm": 0.2083783745765686,
+      "learning_rate": 0.00047867379430520585,
+      "loss": 2.8348,
+      "step": 225
+    },
+    {
+      "epoch": 0.012971392066439082,
+      "grad_norm": 0.1895647495985031,
+      "learning_rate": 0.00047772923418260525,
+      "loss": 2.8212,
+      "step": 226
+    },
+    {
+      "epoch": 0.013028787606556068,
+      "grad_norm": 0.2173570841550827,
+      "learning_rate": 0.0004767835342197954,
+      "loss": 2.8098,
+      "step": 227
+    },
+    {
+      "epoch": 0.013086183146673055,
+      "grad_norm": 0.1693475991487503,
+      "learning_rate": 0.0004758367332907978,
+      "loss": 2.796,
+      "step": 228
+    },
+    {
+      "epoch": 0.013143578686790043,
+      "grad_norm": 0.21635355055332184,
+      "learning_rate": 0.00047488887031489017,
+      "loss": 2.843,
+      "step": 229
+    },
+    {
+      "epoch": 0.013200974226907029,
+      "grad_norm": 0.18521156907081604,
+      "learning_rate": 0.0004739399842550068,
+      "loss": 2.8296,
+      "step": 230
+    },
+    {
+      "epoch": 0.013258369767024016,
+      "grad_norm": 0.22925664484500885,
+      "learning_rate": 0.00047299011411613734,
+      "loss": 2.8287,
+      "step": 231
+    },
+    {
+      "epoch": 0.013315765307141004,
+      "grad_norm": 0.24881386756896973,
+      "learning_rate": 0.00047203929894372264,
+      "loss": 2.8257,
+      "step": 232
+    },
+    {
+      "epoch": 0.01337316084725799,
+      "grad_norm": 0.20801618695259094,
+      "learning_rate": 0.00047108757782205043,
+      "loss": 2.8241,
+      "step": 233
+    },
+    {
+      "epoch": 0.013430556387374977,
+      "grad_norm": 0.199665367603302,
+      "learning_rate": 0.0004701349898726483,
+      "loss": 2.7916,
+      "step": 234
+    },
+    {
+      "epoch": 0.013487951927491965,
+      "grad_norm": 0.25221607089042664,
+      "learning_rate": 0.00046918157425267584,
+      "loss": 2.8233,
+      "step": 235
+    },
+    {
+      "epoch": 0.013545347467608953,
+      "grad_norm": 0.1931813657283783,
+      "learning_rate": 0.00046822737015331505,
+      "loss": 2.8016,
+      "step": 236
+    },
+    {
+      "epoch": 0.013602743007725938,
+      "grad_norm": 0.17353369295597076,
+      "learning_rate": 0.00046727241679815894,
+      "loss": 2.8125,
+      "step": 237
+    },
+    {
+      "epoch": 0.013660138547842926,
+      "grad_norm": 0.22225958108901978,
+      "learning_rate": 0.0004663167534415996,
+      "loss": 2.824,
+      "step": 238
+    },
+    {
+      "epoch": 0.013717534087959914,
+      "grad_norm": 0.17010116577148438,
+      "learning_rate": 0.0004653604193672147,
+      "loss": 2.8425,
+      "step": 239
+    },
+    {
+      "epoch": 0.0137749296280769,
+      "grad_norm": 0.2103683203458786,
+      "learning_rate": 0.00046440345388615225,
+      "loss": 2.8641,
+      "step": 240
+    },
+    {
+      "epoch": 0.013832325168193887,
+      "grad_norm": 0.17934557795524597,
+      "learning_rate": 0.00046344589633551497,
+      "loss": 2.8069,
+      "step": 241
+    },
+    {
+      "epoch": 0.013889720708310875,
+      "grad_norm": 0.2116999328136444,
+      "learning_rate": 0.0004624877860767434,
+      "loss": 2.8601,
+      "step": 242
+    },
+    {
+      "epoch": 0.01394711624842786,
+      "grad_norm": 0.20861205458641052,
+      "learning_rate": 0.0004615291624939975,
+      "loss": 2.8232,
+      "step": 243
+    },
+    {
+      "epoch": 0.014004511788544848,
+      "grad_norm": 0.24393285810947418,
+      "learning_rate": 0.0004605700649925381,
+      "loss": 2.8041,
+      "step": 244
+    },
+    {
+      "epoch": 0.014061907328661836,
+      "grad_norm": 0.2089577168226242,
+      "learning_rate": 0.0004596105329971069,
+      "loss": 2.8351,
+      "step": 245
+    },
+    {
+      "epoch": 0.014119302868778822,
+      "grad_norm": 0.20232421159744263,
+      "learning_rate": 0.00045865060595030616,
+      "loss": 2.8171,
+      "step": 246
+    },
+    {
+      "epoch": 0.01417669840889581,
+      "grad_norm": 0.22081732749938965,
+      "learning_rate": 0.00045769032331097686,
+      "loss": 2.8202,
+      "step": 247
+    },
+    {
+      "epoch": 0.014234093949012797,
+      "grad_norm": 0.17081516981124878,
+      "learning_rate": 0.00045672972455257723,
+      "loss": 2.8358,
+      "step": 248
+    },
+    {
+      "epoch": 0.014291489489129785,
+      "grad_norm": 0.3317008316516876,
+      "learning_rate": 0.0004557688491615597,
+      "loss": 2.8302,
+      "step": 249
+    },
+    {
+      "epoch": 0.01434888502924677,
+      "grad_norm": 0.23239760100841522,
+      "learning_rate": 0.0004548077366357483,
+      "loss": 2.8191,
+      "step": 250
+    },
+    {
+      "epoch": 0.014406280569363758,
+      "grad_norm": 0.22138993442058563,
+      "learning_rate": 0.0004538464264827143,
+      "loss": 2.8096,
+      "step": 251
+    },
+    {
+      "epoch": 0.014463676109480746,
+      "grad_norm": 0.23655574023723602,
+      "learning_rate": 0.000452884958218153,
+      "loss": 2.8295,
+      "step": 252
+    },
+    {
+      "epoch": 0.014521071649597731,
+      "grad_norm": 0.2227945327758789,
+      "learning_rate": 0.000451923371364259,
+      "loss": 2.8158,
+      "step": 253
+    },
+    {
+      "epoch": 0.014578467189714719,
+      "grad_norm": 0.20443300902843475,
+      "learning_rate": 0.0004509617054481017,
+      "loss": 2.83,
+      "step": 254
+    },
+    {
+      "epoch": 0.014635862729831707,
+      "grad_norm": 0.22221451997756958,
+      "learning_rate": 0.00045,
+      "loss": 2.8253,
+      "step": 255
+    },
+    {
+      "epoch": 0.014693258269948693,
+      "grad_norm": 0.1941068023443222,
+      "learning_rate": 0.00044903829455189825,
+      "loss": 2.83,
+      "step": 256
+    },
+    {
+      "epoch": 0.01475065381006568,
+      "grad_norm": 0.1914331614971161,
+      "learning_rate": 0.0004480766286357409,
+      "loss": 2.8162,
+      "step": 257
+    },
+    {
+      "epoch": 0.014808049350182668,
+      "grad_norm": 0.21014779806137085,
+      "learning_rate": 0.0004471150417818469,
+      "loss": 2.7993,
+      "step": 258
+    },
+    {
+      "epoch": 0.014865444890299655,
+      "grad_norm": 0.2057676762342453,
+      "learning_rate": 0.00044615357351728566,
+      "loss": 2.8223,
+      "step": 259
+    },
+    {
+      "epoch": 0.014922840430416641,
+      "grad_norm": 0.19875939190387726,
+      "learning_rate": 0.00044519226336425165,
+      "loss": 2.8016,
+      "step": 260
+    },
+    {
+      "epoch": 0.014980235970533629,
+      "grad_norm": 0.23691999912261963,
+      "learning_rate": 0.0004442311508384402,
+      "loss": 2.8373,
+      "step": 261
+    },
+    {
+      "epoch": 0.015037631510650616,
+      "grad_norm": 0.1729947328567505,
+      "learning_rate": 0.0004432702754474228,
+      "loss": 2.8233,
+      "step": 262
+    },
+    {
+      "epoch": 0.015095027050767602,
+      "grad_norm": 0.18821187317371368,
+      "learning_rate": 0.00044230967668902306,
+      "loss": 2.8128,
+      "step": 263
+    },
+    {
+      "epoch": 0.01515242259088459,
+      "grad_norm": 0.2283882200717926,
+      "learning_rate": 0.00044134939404969387,
+      "loss": 2.8178,
+      "step": 264
+    },
+    {
+      "epoch": 0.015209818131001578,
+      "grad_norm": 0.16724412143230438,
+      "learning_rate": 0.000440389467002893,
+      "loss": 2.8249,
+      "step": 265
+    },
+    {
+      "epoch": 0.015267213671118563,
+      "grad_norm": 0.18209712207317352,
+      "learning_rate": 0.00043942993500746183,
+      "loss": 2.8095,
+      "step": 266
+    },
+    {
+      "epoch": 0.015324609211235551,
+      "grad_norm": 0.1857995092868805,
+      "learning_rate": 0.00043847083750600253,
+      "loss": 2.806,
+      "step": 267
+    },
+    {
+      "epoch": 0.015382004751352539,
+      "grad_norm": 0.20734605193138123,
+      "learning_rate": 0.0004375122139232566,
+      "loss": 2.8695,
+      "step": 268
+    },
+    {
+      "epoch": 0.015439400291469526,
+      "grad_norm": 0.23138895630836487,
+      "learning_rate": 0.00043655410366448495,
+      "loss": 2.8033,
+      "step": 269
+    },
+    {
+      "epoch": 0.015496795831586512,
+      "grad_norm": 0.20481987297534943,
+      "learning_rate": 0.0004355965461138477,
+      "loss": 2.8269,
+      "step": 270
+    },
+    {
+      "epoch": 0.0155541913717035,
+      "grad_norm": 0.2318529337644577,
+      "learning_rate": 0.00043463958063278524,
+      "loss": 2.8332,
+      "step": 271
+    },
+    {
+      "epoch": 0.015611586911820487,
+      "grad_norm": 0.2501411736011505,
+      "learning_rate": 0.00043368324655840035,
+      "loss": 2.8445,
+      "step": 272
+    },
+    {
+      "epoch": 0.015668982451937475,
+      "grad_norm": 0.26137158274650574,
+      "learning_rate": 0.0004327275832018411,
+      "loss": 2.8279,
+      "step": 273
+    },
+    {
+      "epoch": 0.015726377992054463,
+      "grad_norm": 0.19074887037277222,
+      "learning_rate": 0.0004317726298466849,
+      "loss": 2.8132,
+      "step": 274
+    },
+    {
+      "epoch": 0.015783773532171447,
+      "grad_norm": 0.26000818610191345,
+      "learning_rate": 0.0004308184257473241,
+      "loss": 2.8091,
+      "step": 275
+    },
+    {
+      "epoch": 0.015841169072288434,
+      "grad_norm": 0.16060984134674072,
+      "learning_rate": 0.0004298650101273517,
+      "loss": 2.8206,
+      "step": 276
+    },
+    {
+      "epoch": 0.015898564612405422,
+      "grad_norm": 0.284445583820343,
+      "learning_rate": 0.00042891242217794954,
+      "loss": 2.7867,
+      "step": 277
+    },
+    {
+      "epoch": 0.01595596015252241,
+      "grad_norm": 0.15903466939926147,
+      "learning_rate": 0.0004279607010562773,
+      "loss": 2.83,
+      "step": 278
+    },
+    {
+      "epoch": 0.016013355692639397,
+      "grad_norm": 0.24330751597881317,
+      "learning_rate": 0.0004270098858838626,
+      "loss": 2.817,
+      "step": 279
+    },
+    {
+      "epoch": 0.016070751232756385,
+      "grad_norm": 0.1687777042388916,
+      "learning_rate": 0.0004260600157449931,
+      "loss": 2.8112,
+      "step": 280
+    },
+    {
+      "epoch": 0.01612814677287337,
+      "grad_norm": 0.18230785429477692,
+      "learning_rate": 0.0004251111296851098,
+      "loss": 2.8394,
+      "step": 281
+    },
+    {
+      "epoch": 0.016185542312990357,
+      "grad_norm": 0.1889660507440567,
+      "learning_rate": 0.00042416326670920217,
+      "loss": 2.8109,
+      "step": 282
+    },
+    {
+      "epoch": 0.016242937853107344,
+      "grad_norm": 0.16135123372077942,
+      "learning_rate": 0.0004232164657802045,
+      "loss": 2.7953,
+      "step": 283
+    },
+    {
+      "epoch": 0.016300333393224332,
+      "grad_norm": 0.15787218511104584,
+      "learning_rate": 0.00042227076581739467,
+      "loss": 2.7921,
+      "step": 284
+    },
+    {
+      "epoch": 0.01635772893334132,
+      "grad_norm": 0.16313977539539337,
+      "learning_rate": 0.0004213262056947942,
+      "loss": 2.8107,
+      "step": 285
+    },
+    {
+      "epoch": 0.016415124473458307,
+      "grad_norm": 0.18806132674217224,
+      "learning_rate": 0.0004203828242395699,
+      "loss": 2.8451,
+      "step": 286
+    },
+    {
+      "epoch": 0.016472520013575295,
+      "grad_norm": 0.17279674112796783,
+      "learning_rate": 0.00041944066023043866,
+      "loss": 2.8333,
+      "step": 287
+    },
+    {
+      "epoch": 0.01652991555369228,
+      "grad_norm": 0.17451834678649902,
+      "learning_rate": 0.00041849975239607255,
+      "loss": 2.7798,
+      "step": 288
+    },
+    {
+      "epoch": 0.016587311093809266,
+      "grad_norm": 0.1943039745092392,
+      "learning_rate": 0.00041756013941350747,
+      "loss": 2.8011,
+      "step": 289
+    },
+    {
+      "epoch": 0.016644706633926254,
+      "grad_norm": 0.1578904092311859,
+      "learning_rate": 0.0004166218599065528,
+      "loss": 2.852,
+      "step": 290
+    },
+    {
+      "epoch": 0.01670210217404324,
+      "grad_norm": 0.20066620409488678,
+      "learning_rate": 0.0004156849524442042,
+      "loss": 2.7876,
+      "step": 291
+    },
+    {
+      "epoch": 0.01675949771416023,
+      "grad_norm": 0.18306495249271393,
+      "learning_rate": 0.0004147494555390577,
+      "loss": 2.817,
+      "step": 292
+    },
+    {
+      "epoch": 0.016816893254277217,
+      "grad_norm": 0.1622687727212906,
+      "learning_rate": 0.0004138154076457271,
+      "loss": 2.815,
+      "step": 293
+    },
+    {
+      "epoch": 0.0168742887943942,
+      "grad_norm": 0.2056518942117691,
+      "learning_rate": 0.0004128828471592628,
+      "loss": 2.8131,
+      "step": 294
+    },
+    {
+      "epoch": 0.01693168433451119,
+      "grad_norm": 0.17123937606811523,
+      "learning_rate": 0.00041195181241357383,
+      "loss": 2.8025,
+      "step": 295
+    },
+    {
+      "epoch": 0.016989079874628176,
+      "grad_norm": 0.2233334332704544,
+      "learning_rate": 0.00041102234167985204,
+      "loss": 2.8347,
+      "step": 296
+    },
+    {
+      "epoch": 0.017046475414745164,
+      "grad_norm": 0.20740529894828796,
+      "learning_rate": 0.0004100944731649987,
+      "loss": 2.8099,
+      "step": 297
+    },
+    {
+      "epoch": 0.01710387095486215,
+      "grad_norm": 0.20391066372394562,
+      "learning_rate": 0.0004091682450100543,
+      "loss": 2.8363,
+      "step": 298
+    },
+    {
+      "epoch": 0.01716126649497914,
+      "grad_norm": 0.17306548357009888,
+      "learning_rate": 0.0004082436952886305,
+      "loss": 2.8211,
+      "step": 299
+    },
+    {
+      "epoch": 0.017218662035096127,
+      "grad_norm": 0.24933576583862305,
+      "learning_rate": 0.0004073208620053451,
+      "loss": 2.8048,
+      "step": 300
+    },
+    {
+      "epoch": 0.017218662035096127,
+      "eval_loss": 2.7432332038879395,
+      "eval_runtime": 85.2508,
+      "eval_samples_per_second": 50.615,
+      "eval_steps_per_second": 12.657,
+      "step": 300
+    },
+    {
+      "epoch": 0.01727605757521311,
+      "grad_norm": 0.231708824634552,
+      "learning_rate": 0.00040639978309425995,
+      "loss": 2.8025,
+      "step": 301
+    },
+    {
+      "epoch": 0.0173334531153301,
+      "grad_norm": 0.15970614552497864,
+      "learning_rate": 0.00040548049641732137,
+      "loss": 2.8392,
+      "step": 302
+    },
+    {
+      "epoch": 0.017390848655447086,
+      "grad_norm": 0.20457029342651367,
+      "learning_rate": 0.0004045630397628042,
+      "loss": 2.8247,
+      "step": 303
+    },
+    {
+      "epoch": 0.017448244195564074,
+      "grad_norm": 0.1734900325536728,
+      "learning_rate": 0.00040364745084375787,
+      "loss": 2.7979,
+      "step": 304
+    },
+    {
+      "epoch": 0.01750563973568106,
+      "grad_norm": 0.19265452027320862,
+      "learning_rate": 0.00040273376729645685,
+      "loss": 2.8033,
+      "step": 305
+    },
+    {
+      "epoch": 0.01756303527579805,
+      "grad_norm": 0.19174844026565552,
+      "learning_rate": 0.00040182202667885317,
+      "loss": 2.8354,
+      "step": 306
+    },
+    {
+      "epoch": 0.017620430815915036,
+      "grad_norm": 0.27793413400650024,
+      "learning_rate": 0.00040091226646903245,
+      "loss": 2.797,
+      "step": 307
+    },
+    {
+      "epoch": 0.01767782635603202,
+      "grad_norm": 0.1806309074163437,
+      "learning_rate": 0.00040000452406367367,
+      "loss": 2.8046,
+      "step": 308
+    },
+    {
+      "epoch": 0.017735221896149008,
+      "grad_norm": 0.2249089479446411,
+      "learning_rate": 0.0003990988367765118,
+      "loss": 2.8125,
+      "step": 309
+    },
+    {
+      "epoch": 0.017792617436265996,
+      "grad_norm": 0.27839699387550354,
+      "learning_rate": 0.00039819524183680384,
+      "loss": 2.8183,
+      "step": 310
+    },
+    {
+      "epoch": 0.017850012976382983,
+      "grad_norm": 0.1877232789993286,
+      "learning_rate": 0.00039729377638779857,
+      "loss": 2.7989,
+      "step": 311
+    },
+    {
+      "epoch": 0.01790740851649997,
+      "grad_norm": 0.25160273909568787,
+      "learning_rate": 0.00039639447748520985,
+      "loss": 2.8536,
+      "step": 312
+    },
+    {
+      "epoch": 0.01796480405661696,
+      "grad_norm": 0.23843353986740112,
+      "learning_rate": 0.0003954973820956932,
+      "loss": 2.8064,
+      "step": 313
+    },
+    {
+      "epoch": 0.018022199596733943,
+      "grad_norm": 0.2549470365047455,
+      "learning_rate": 0.00039460252709532656,
+      "loss": 2.8415,
+      "step": 314
+    },
+    {
+      "epoch": 0.01807959513685093,
+      "grad_norm": 0.39248892664909363,
+      "learning_rate": 0.0003937099492680938,
+      "loss": 2.8137,
+      "step": 315
+    },
+    {
+      "epoch": 0.018136990676967918,
+      "grad_norm": 0.24034982919692993,
+      "learning_rate": 0.0003928196853043737,
+      "loss": 2.8301,
+      "step": 316
+    },
+    {
+      "epoch": 0.018194386217084905,
+      "grad_norm": 0.29434794187545776,
+      "learning_rate": 0.00039193177179943083,
+      "loss": 2.8288,
+      "step": 317
+    },
+    {
+      "epoch": 0.018251781757201893,
+      "grad_norm": 0.21636317670345306,
+      "learning_rate": 0.0003910462452519114,
+      "loss": 2.8121,
+      "step": 318
+    },
+    {
+      "epoch": 0.01830917729731888,
+      "grad_norm": 0.2217407375574112,
+      "learning_rate": 0.0003901631420623437,
+      "loss": 2.8551,
+      "step": 319
+    },
+    {
+      "epoch": 0.01836657283743587,
+      "grad_norm": 0.20126426219940186,
+      "learning_rate": 0.0003892824985316409,
+      "loss": 2.7812,
+      "step": 320
+    },
+    {
+      "epoch": 0.018423968377552852,
+      "grad_norm": 0.20343463122844696,
+      "learning_rate": 0.0003884043508596093,
+      "loss": 2.7959,
+      "step": 321
+    },
+    {
+      "epoch": 0.01848136391766984,
+      "grad_norm": 0.22265484929084778,
+      "learning_rate": 0.00038752873514346015,
+      "loss": 2.8254,
+      "step": 322
+    },
+    {
+      "epoch": 0.018538759457786828,
+      "grad_norm": 0.20545947551727295,
+      "learning_rate": 0.000386655687376326,
+      "loss": 2.8166,
+      "step": 323
+    },
+    {
+      "epoch": 0.018596154997903815,
+      "grad_norm": 0.17015507817268372,
+      "learning_rate": 0.00038578524344578115,
+      "loss": 2.806,
+      "step": 324
+    },
+    {
+      "epoch": 0.018653550538020803,
+      "grad_norm": 0.19378258287906647,
+      "learning_rate": 0.00038491743913236624,
+      "loss": 2.7979,
+      "step": 325
+    },
+    {
+      "epoch": 0.01871094607813779,
+      "grad_norm": 0.2112617790699005,
+      "learning_rate": 0.0003840523101081177,
+      "loss": 2.8149,
+      "step": 326
+    },
+    {
+      "epoch": 0.018768341618254775,
+      "grad_norm": 0.18846029043197632,
+      "learning_rate": 0.0003831898919351011,
+      "loss": 2.8334,
+      "step": 327
+    },
+    {
+      "epoch": 0.018825737158371762,
+      "grad_norm": 0.20672033727169037,
+      "learning_rate": 0.00038233022006394976,
+      "loss": 2.8061,
+      "step": 328
+    },
+    {
+      "epoch": 0.01888313269848875,
+      "grad_norm": 0.2700256109237671,
+      "learning_rate": 0.00038147332983240717,
+      "loss": 2.8101,
+      "step": 329
+    },
+    {
+      "epoch": 0.018940528238605737,
+      "grad_norm": 0.16990099847316742,
+      "learning_rate": 0.00038061925646387467,
+      "loss": 2.8227,
+      "step": 330
+    },
+    {
+      "epoch": 0.018997923778722725,
+      "grad_norm": 0.2140357792377472,
+      "learning_rate": 0.0003797680350659631,
+      "loss": 2.8018,
+      "step": 331
+    },
+    {
+      "epoch": 0.019055319318839713,
+      "grad_norm": 0.2538260221481323,
+      "learning_rate": 0.0003789197006290502,
+      "loss": 2.7725,
+      "step": 332
+    },
+    {
+      "epoch": 0.0191127148589567,
+      "grad_norm": 0.1694011092185974,
+      "learning_rate": 0.0003780742880248419,
+      "loss": 2.7973,
+      "step": 333
+    },
+    {
+      "epoch": 0.019170110399073684,
+      "grad_norm": 0.2092764526605606,
+      "learning_rate": 0.0003772318320049391,
+      "loss": 2.8256,
+      "step": 334
+    },
+    {
+      "epoch": 0.019227505939190672,
+      "grad_norm": 0.22675682604312897,
+      "learning_rate": 0.0003763923671994093,
+      "loss": 2.8092,
+      "step": 335
+    },
+    {
+      "epoch": 0.01928490147930766,
+      "grad_norm": 0.20571155846118927,
+      "learning_rate": 0.0003755559281153625,
+      "loss": 2.8176,
+      "step": 336
+    },
+    {
+      "epoch": 0.019342297019424647,
+      "grad_norm": 0.18606650829315186,
+      "learning_rate": 0.0003747225491355334,
+      "loss": 2.8019,
+      "step": 337
+    },
+    {
+      "epoch": 0.019399692559541635,
+      "grad_norm": 0.19859890639781952,
+      "learning_rate": 0.00037389226451686763,
+      "loss": 2.8036,
+      "step": 338
+    },
+    {
+      "epoch": 0.019457088099658622,
+      "grad_norm": 0.1632896512746811,
+      "learning_rate": 0.00037306510838911404,
+      "loss": 2.797,
+      "step": 339
+    },
+    {
+      "epoch": 0.01951448363977561,
+      "grad_norm": 0.17494754493236542,
+      "learning_rate": 0.00037224111475342116,
+      "loss": 2.8152,
+      "step": 340
+    },
+    {
+      "epoch": 0.019571879179892594,
+      "grad_norm": 0.20659732818603516,
+      "learning_rate": 0.00037142031748094016,
+      "loss": 2.8061,
+      "step": 341
+    },
+    {
+      "epoch": 0.019629274720009582,
+      "grad_norm": 0.18716713786125183,
+      "learning_rate": 0.00037060275031143184,
+      "loss": 2.8419,
+      "step": 342
+    },
+    {
+      "epoch": 0.01968667026012657,
+      "grad_norm": 0.2575749158859253,
+      "learning_rate": 0.0003697884468518805,
+      "loss": 2.7814,
+      "step": 343
+    },
+    {
+      "epoch": 0.019744065800243557,
+      "grad_norm": 0.19076134264469147,
+      "learning_rate": 0.0003689774405751119,
+      "loss": 2.797,
+      "step": 344
+    },
+    {
+      "epoch": 0.019801461340360545,
+      "grad_norm": 0.19563442468643188,
+      "learning_rate": 0.00036816976481841764,
+      "loss": 2.8269,
+      "step": 345
+    },
+    {
+      "epoch": 0.019858856880477532,
+      "grad_norm": 0.1790810525417328,
+      "learning_rate": 0.0003673654527821846,
+      "loss": 2.7856,
+      "step": 346
+    },
+    {
+      "epoch": 0.019916252420594516,
+      "grad_norm": 0.2125868797302246,
+      "learning_rate": 0.00036656453752853025,
+      "loss": 2.7973,
+      "step": 347
+    },
+    {
+      "epoch": 0.019973647960711504,
+      "grad_norm": 0.1454995572566986,
+      "learning_rate": 0.00036576705197994376,
+      "loss": 2.7869,
+      "step": 348
+    },
+    {
+      "epoch": 0.02003104350082849,
+      "grad_norm": 0.2808379530906677,
+      "learning_rate": 0.00036497302891793255,
+      "loss": 2.7923,
+      "step": 349
+    },
+    {
+      "epoch": 0.02008843904094548,
+      "grad_norm": 0.1776140034198761,
+      "learning_rate": 0.0003641825009816745,
+      "loss": 2.8194,
+      "step": 350
+    },
+    {
+      "epoch": 0.020145834581062467,
+      "grad_norm": 0.22207793593406677,
+      "learning_rate": 0.0003633955006666771,
+      "loss": 2.8234,
+      "step": 351
+    },
+    {
+      "epoch": 0.020203230121179454,
+      "grad_norm": 0.24642404913902283,
+      "learning_rate": 0.0003626120603234406,
+      "loss": 2.8351,
+      "step": 352
+    },
+    {
+      "epoch": 0.020260625661296442,
+      "grad_norm": 0.24731726944446564,
+      "learning_rate": 0.000361832212156129,
+      "loss": 2.7983,
+      "step": 353
+    },
+    {
+      "epoch": 0.020318021201413426,
+      "grad_norm": 0.21677981317043304,
+      "learning_rate": 0.0003610559882212461,
+      "loss": 2.8372,
+      "step": 354
+    },
+    {
+      "epoch": 0.020375416741530414,
+      "grad_norm": 0.28350090980529785,
+      "learning_rate": 0.00036028342042631755,
+      "loss": 2.8138,
+      "step": 355
+    },
+    {
+      "epoch": 0.0204328122816474,
+      "grad_norm": 0.22418756783008575,
+      "learning_rate": 0.00035951454052857954,
+      "loss": 2.7897,
+      "step": 356
+    },
+    {
+      "epoch": 0.02049020782176439,
+      "grad_norm": 0.27765804529190063,
+      "learning_rate": 0.000358749380133673,
+      "loss": 2.8139,
+      "step": 357
+    },
+    {
+      "epoch": 0.020547603361881377,
+      "grad_norm": 0.2694258391857147,
+      "learning_rate": 0.000357987970694345,
+      "loss": 2.7881,
+      "step": 358
+    },
+    {
+      "epoch": 0.020604998901998364,
+      "grad_norm": 0.3746117055416107,
+      "learning_rate": 0.00035723034350915525,
+      "loss": 2.8108,
+      "step": 359
+    },
+    {
+      "epoch": 0.02066239444211535,
+      "grad_norm": 0.22864773869514465,
+      "learning_rate": 0.00035647652972119,
+      "loss": 2.8102,
+      "step": 360
+    },
+    {
+      "epoch": 0.020719789982232336,
+      "grad_norm": 0.2728801369667053,
+      "learning_rate": 0.0003557265603167814,
+      "loss": 2.8046,
+      "step": 361
+    },
+    {
+      "epoch": 0.020777185522349324,
+      "grad_norm": 0.2561710774898529,
+      "learning_rate": 0.0003549804661242345,
+      "loss": 2.8242,
+      "step": 362
+    },
+    {
+      "epoch": 0.02083458106246631,
+      "grad_norm": 0.26235631108283997,
+      "learning_rate": 0.00035423827781255914,
+      "loss": 2.847,
+      "step": 363
+    },
+    {
+      "epoch": 0.0208919766025833,
+      "grad_norm": 0.24725806713104248,
+      "learning_rate": 0.0003535000258902099,
+      "loss": 2.7873,
+      "step": 364
+    },
+    {
+      "epoch": 0.020949372142700286,
+      "grad_norm": 0.2562279999256134,
+      "learning_rate": 0.0003527657407038317,
+      "loss": 2.799,
+      "step": 365
+    },
+    {
+      "epoch": 0.021006767682817274,
+      "grad_norm": 0.20368199050426483,
+      "learning_rate": 0.00035203545243701266,
+      "loss": 2.8011,
+      "step": 366
+    },
+    {
+      "epoch": 0.021064163222934258,
+      "grad_norm": 0.25594958662986755,
+      "learning_rate": 0.0003513091911090431,
+      "loss": 2.8099,
+      "step": 367
+    },
+    {
+      "epoch": 0.021121558763051246,
+      "grad_norm": 0.20084761083126068,
+      "learning_rate": 0.00035058698657368154,
+      "loss": 2.8249,
+      "step": 368
+    },
+    {
+      "epoch": 0.021178954303168233,
+      "grad_norm": 0.24110020697116852,
+      "learning_rate": 0.00034986886851792775,
+      "loss": 2.8058,
+      "step": 369
+    },
+    {
+      "epoch": 0.02123634984328522,
+      "grad_norm": 0.2016633003950119,
+      "learning_rate": 0.0003491548664608024,
+      "loss": 2.7935,
+      "step": 370
+    },
+    {
+      "epoch": 0.02129374538340221,
+      "grad_norm": 0.2722468376159668,
+      "learning_rate": 0.0003484450097521336,
+      "loss": 2.8146,
+      "step": 371
+    },
+    {
+      "epoch": 0.021351140923519196,
+      "grad_norm": 0.2089434564113617,
+      "learning_rate": 0.0003477393275713501,
+      "loss": 2.8231,
+      "step": 372
+    },
+    {
+      "epoch": 0.021408536463636184,
+      "grad_norm": 0.24770453572273254,
+      "learning_rate": 0.0003470378489262824,
+      "loss": 2.7994,
+      "step": 373
+    },
+    {
+      "epoch": 0.021465932003753168,
+      "grad_norm": 0.21104897558689117,
+      "learning_rate": 0.00034634060265197026,
+      "loss": 2.8189,
+      "step": 374
+    },
+    {
+      "epoch": 0.021523327543870156,
+      "grad_norm": 0.23374824225902557,
+      "learning_rate": 0.000345647617409477,
+      "loss": 2.783,
+      "step": 375
+    },
+    {
+      "epoch": 0.021580723083987143,
+      "grad_norm": 0.24334168434143066,
+      "learning_rate": 0.00034495892168471176,
+      "loss": 2.8092,
+      "step": 376
+    },
+    {
+      "epoch": 0.02163811862410413,
+      "grad_norm": 0.22772932052612305,
+      "learning_rate": 0.00034427454378725827,
+      "loss": 2.8178,
+      "step": 377
+    },
+    {
+      "epoch": 0.02169551416422112,
+      "grad_norm": 0.22545067965984344,
+      "learning_rate": 0.00034359451184921125,
+      "loss": 2.7961,
+      "step": 378
+    },
+    {
+      "epoch": 0.021752909704338106,
+      "grad_norm": 0.2873929738998413,
+      "learning_rate": 0.00034291885382402044,
+      "loss": 2.8408,
+      "step": 379
+    },
+    {
+      "epoch": 0.02181030524445509,
+      "grad_norm": 0.2099824994802475,
+      "learning_rate": 0.00034224759748534083,
+      "loss": 2.782,
+      "step": 380
+    },
+    {
+      "epoch": 0.021867700784572078,
+      "grad_norm": 0.32221996784210205,
+      "learning_rate": 0.0003415807704258913,
+      "loss": 2.8337,
+      "step": 381
+    },
+    {
+      "epoch": 0.021925096324689065,
+      "grad_norm": 0.2531490623950958,
+      "learning_rate": 0.0003409184000563204,
+      "loss": 2.8273,
+      "step": 382
+    },
+    {
+      "epoch": 0.021982491864806053,
+      "grad_norm": 0.3075484037399292,
+      "learning_rate": 0.00034026051360407973,
+      "loss": 2.7805,
+      "step": 383
+    },
+    {
+      "epoch": 0.02203988740492304,
+      "grad_norm": 0.2366313338279724,
+      "learning_rate": 0.0003396071381123047,
+      "loss": 2.8278,
+      "step": 384
+    },
+    {
+      "epoch": 0.022097282945040028,
+      "grad_norm": 0.2348204106092453,
+      "learning_rate": 0.00033895830043870266,
+      "loss": 2.7922,
+      "step": 385
+    },
+    {
+      "epoch": 0.022154678485157016,
+      "grad_norm": 0.28124627470970154,
+      "learning_rate": 0.00033831402725444896,
+      "loss": 2.8065,
+      "step": 386
+    },
+    {
+      "epoch": 0.022212074025274,
+      "grad_norm": 0.1927008032798767,
+      "learning_rate": 0.0003376743450430907,
+      "loss": 2.7958,
+      "step": 387
+    },
+    {
+      "epoch": 0.022269469565390988,
+      "grad_norm": 0.26325997710227966,
+      "learning_rate": 0.0003370392800994583,
+      "loss": 2.8313,
+      "step": 388
+    },
+    {
+      "epoch": 0.022326865105507975,
+      "grad_norm": 0.23394963145256042,
+      "learning_rate": 0.0003364088585285842,
+      "loss": 2.8126,
+      "step": 389
+    },
+    {
+      "epoch": 0.022384260645624963,
+      "grad_norm": 0.26055994629859924,
+      "learning_rate": 0.00033578310624462983,
+      "loss": 2.787,
+      "step": 390
+    },
+    {
+      "epoch": 0.02244165618574195,
+      "grad_norm": 0.2207145392894745,
+      "learning_rate": 0.0003351620489698208,
+      "loss": 2.796,
+      "step": 391
+    },
+    {
+      "epoch": 0.022499051725858938,
+      "grad_norm": 0.34231698513031006,
+      "learning_rate": 0.0003345457122333891,
+      "loss": 2.7951,
+      "step": 392
+    },
+    {
+      "epoch": 0.022556447265975922,
+      "grad_norm": 0.22361671924591064,
+      "learning_rate": 0.00033393412137052396,
+      "loss": 2.8251,
+      "step": 393
+    },
+    {
+      "epoch": 0.02261384280609291,
+      "grad_norm": 0.24573372304439545,
+      "learning_rate": 0.0003333273015213304,
+      "loss": 2.7899,
+      "step": 394
+    },
+    {
+      "epoch": 0.022671238346209897,
+      "grad_norm": 0.22109688818454742,
+      "learning_rate": 0.0003327252776297955,
+      "loss": 2.8178,
+      "step": 395
+    },
+    {
+      "epoch": 0.022728633886326885,
+      "grad_norm": 0.22289875149726868,
+      "learning_rate": 0.00033212807444276364,
+      "loss": 2.8053,
+      "step": 396
+    },
+    {
+      "epoch": 0.022786029426443873,
+      "grad_norm": 0.21445147693157196,
+      "learning_rate": 0.00033153571650891865,
+      "loss": 2.7998,
+      "step": 397
+    },
+    {
+      "epoch": 0.02284342496656086,
+      "grad_norm": 0.25061139464378357,
+      "learning_rate": 0.00033094822817777514,
+      "loss": 2.8055,
+      "step": 398
+    },
+    {
+      "epoch": 0.022900820506677848,
+      "grad_norm": 0.24680854380130768,
+      "learning_rate": 0.0003303656335986773,
+      "loss": 2.8143,
+      "step": 399
+    },
+    {
+      "epoch": 0.022958216046794832,
+      "grad_norm": 0.16644932329654694,
+      "learning_rate": 0.0003297879567198065,
+      "loss": 2.8192,
+      "step": 400
+    },
+    {
+      "epoch": 0.022958216046794832,
+      "eval_loss": 2.738191604614258,
+      "eval_runtime": 85.3252,
+      "eval_samples_per_second": 50.571,
+      "eval_steps_per_second": 12.646,
+      "step": 400
+    },
+    {
+      "epoch": 0.02301561158691182,
+      "grad_norm": 0.2816384434700012,
+      "learning_rate": 0.00032921522128719657,
+      "loss": 2.8209,
+      "step": 401
+    },
+    {
+      "epoch": 0.023073007127028807,
+      "grad_norm": 0.20395685732364655,
+      "learning_rate": 0.00032864745084375783,
+      "loss": 2.8021,
+      "step": 402
+    },
+    {
+      "epoch": 0.023130402667145795,
+      "grad_norm": 0.24216794967651367,
+      "learning_rate": 0.00032808466872830957,
+      "loss": 2.8447,
+      "step": 403
+    },
+    {
+      "epoch": 0.023187798207262782,
+      "grad_norm": 0.2526738941669464,
+      "learning_rate": 0.00032752689807462017,
+      "loss": 2.7906,
+      "step": 404
+    },
+    {
+      "epoch": 0.02324519374737977,
+      "grad_norm": 0.21725283563137054,
+      "learning_rate": 0.0003269741618104566,
+      "loss": 2.7943,
+      "step": 405
+    },
+    {
+      "epoch": 0.023302589287496758,
+      "grad_norm": 0.2765718102455139,
+      "learning_rate": 0.00032642648265664175,
+      "loss": 2.8109,
+      "step": 406
+    },
+    {
+      "epoch": 0.02335998482761374,
+      "grad_norm": 0.20015880465507507,
+      "learning_rate": 0.00032588388312612053,
+      "loss": 2.8239,
+      "step": 407
+    },
+    {
+      "epoch": 0.02341738036773073,
+      "grad_norm": 0.26865240931510925,
+      "learning_rate": 0.0003253463855230344,
+      "loss": 2.8279,
+      "step": 408
+    },
+    {
+      "epoch": 0.023474775907847717,
+      "grad_norm": 0.23522211611270905,
+      "learning_rate": 0.0003248140119418046,
+      "loss": 2.8123,
+      "step": 409
+    },
+    {
+      "epoch": 0.023532171447964705,
+      "grad_norm": 0.2388644963502884,
+      "learning_rate": 0.0003242867842662239,
+      "loss": 2.8057,
+      "step": 410
+    },
+    {
+      "epoch": 0.023589566988081692,
+      "grad_norm": 0.18323197960853577,
+      "learning_rate": 0.00032376472416855703,
+      "loss": 2.8193,
+      "step": 411
+    },
+    {
+      "epoch": 0.02364696252819868,
+      "grad_norm": 0.24734856188297272,
+      "learning_rate": 0.00032324785310864983,
+      "loss": 2.7924,
+      "step": 412
+    },
+    {
+      "epoch": 0.023704358068315664,
+      "grad_norm": 0.1722363829612732,
+      "learning_rate": 0.0003227361923330471,
+      "loss": 2.8242,
+      "step": 413
+    },
+    {
+      "epoch": 0.02376175360843265,
+      "grad_norm": 0.2052358090877533,
+      "learning_rate": 0.00032222976287411934,
+      "loss": 2.8129,
+      "step": 414
+    },
+    {
+      "epoch": 0.02381914914854964,
+      "grad_norm": 0.2536105811595917,
+      "learning_rate": 0.00032172858554919807,
+      "loss": 2.8207,
+      "step": 415
+    },
+    {
+      "epoch": 0.023876544688666627,
+      "grad_norm": 0.23084022104740143,
+      "learning_rate": 0.00032123268095972005,
+      "loss": 2.8156,
+      "step": 416
+    },
+    {
+      "epoch": 0.023933940228783614,
+      "grad_norm": 0.28741586208343506,
+      "learning_rate": 0.00032074206949038073,
+      "loss": 2.8008,
+      "step": 417
+    },
+    {
+      "epoch": 0.023991335768900602,
+      "grad_norm": 0.2419297993183136,
+      "learning_rate": 0.0003202567713082959,
+      "loss": 2.8112,
+      "step": 418
+    },
+    {
+      "epoch": 0.02404873130901759,
+      "grad_norm": 0.19744537770748138,
+      "learning_rate": 0.0003197768063621732,
+      "loss": 2.7894,
+      "step": 419
+    },
+    {
+      "epoch": 0.024106126849134574,
+      "grad_norm": 0.22780993580818176,
+      "learning_rate": 0.0003193021943814916,
+      "loss": 2.8019,
+      "step": 420
+    },
+    {
+      "epoch": 0.02416352238925156,
+      "grad_norm": 0.2176397144794464,
+      "learning_rate": 0.00031883295487569063,
+      "loss": 2.8183,
+      "step": 421
+    },
+    {
+      "epoch": 0.02422091792936855,
+      "grad_norm": 0.23891203105449677,
+      "learning_rate": 0.00031836910713336857,
+      "loss": 2.8022,
+      "step": 422
+    },
+    {
+      "epoch": 0.024278313469485537,
+      "grad_norm": 0.18507017195224762,
+      "learning_rate": 0.0003179106702214893,
+      "loss": 2.8013,
+      "step": 423
+    },
+    {
+      "epoch": 0.024335709009602524,
+      "grad_norm": 0.20408926904201508,
+      "learning_rate": 0.0003174576629845987,
+      "loss": 2.8085,
+      "step": 424
+    },
+    {
+      "epoch": 0.024393104549719512,
+      "grad_norm": 0.18055075407028198,
+      "learning_rate": 0.00031701010404404996,
+      "loss": 2.8341,
+      "step": 425
+    },
+    {
+      "epoch": 0.024450500089836496,
+      "grad_norm": 0.22974956035614014,
+      "learning_rate": 0.0003165680117972382,
+      "loss": 2.8044,
+      "step": 426
+    },
+    {
+      "epoch": 0.024507895629953484,
+      "grad_norm": 0.17688511312007904,
+      "learning_rate": 0.00031613140441684413,
+      "loss": 2.7866,
+      "step": 427
+    },
+    {
+      "epoch": 0.02456529117007047,
+      "grad_norm": 0.22350828349590302,
+      "learning_rate": 0.000315700299850087,
+      "loss": 2.7939,
+      "step": 428
+    },
+    {
+      "epoch": 0.02462268671018746,
+      "grad_norm": 0.2138863056898117,
+      "learning_rate": 0.0003152747158179871,
+      "loss": 2.8112,
+      "step": 429
+    },
+    {
+      "epoch": 0.024680082250304446,
+      "grad_norm": 0.1666262447834015,
+      "learning_rate": 0.0003148546698146371,
+      "loss": 2.8464,
+      "step": 430
+    },
+    {
+      "epoch": 0.024737477790421434,
+      "grad_norm": 0.23217864334583282,
+      "learning_rate": 0.00031444017910648293,
+      "loss": 2.8154,
+      "step": 431
+    },
+    {
+      "epoch": 0.02479487333053842,
+      "grad_norm": 0.23967209458351135,
+      "learning_rate": 0.00031403126073161424,
+      "loss": 2.8068,
+      "step": 432
+    },
+    {
+      "epoch": 0.024852268870655406,
+      "grad_norm": 0.2363416850566864,
+      "learning_rate": 0.0003136279314990637,
+      "loss": 2.832,
+      "step": 433
+    },
+    {
+      "epoch": 0.024909664410772393,
+      "grad_norm": 0.20204566419124603,
+      "learning_rate": 0.00031323020798811643,
+      "loss": 2.8118,
+      "step": 434
+    },
+    {
+      "epoch": 0.02496705995088938,
+      "grad_norm": 0.2645012438297272,
+      "learning_rate": 0.00031283810654762816,
+      "loss": 2.7988,
+      "step": 435
+    },
+    {
+      "epoch": 0.02502445549100637,
+      "grad_norm": 0.31096434593200684,
+      "learning_rate": 0.0003124516432953532,
+      "loss": 2.8021,
+      "step": 436
+    },
+    {
+      "epoch": 0.025081851031123356,
+      "grad_norm": 0.25740697979927063,
+      "learning_rate": 0.00031207083411728236,
+      "loss": 2.828,
+      "step": 437
+    },
+    {
+      "epoch": 0.025139246571240344,
+      "grad_norm": 0.24895477294921875,
+      "learning_rate": 0.00031169569466698937,
+      "loss": 2.8073,
+      "step": 438
+    },
+    {
+      "epoch": 0.02519664211135733,
+      "grad_norm": 0.2860502004623413,
+      "learning_rate": 0.00031132624036498774,
+      "loss": 2.8275,
+      "step": 439
+    },
+    {
+      "epoch": 0.025254037651474315,
+      "grad_norm": 0.3134096562862396,
+      "learning_rate": 0.00031096248639809674,
+      "loss": 2.816,
+      "step": 440
+    },
+    {
+      "epoch": 0.025311433191591303,
+      "grad_norm": 0.2185070812702179,
+      "learning_rate": 0.0003106044477188172,
+      "loss": 2.7799,
+      "step": 441
+    },
+    {
+      "epoch": 0.02536882873170829,
+      "grad_norm": 0.3582714796066284,
+      "learning_rate": 0.0003102521390447169,
+      "loss": 2.7923,
+      "step": 442
+    },
+    {
+      "epoch": 0.02542622427182528,
+      "grad_norm": 0.19494207203388214,
+      "learning_rate": 0.00030990557485782553,
+      "loss": 2.7999,
+      "step": 443
+    },
+    {
+      "epoch": 0.025483619811942266,
+      "grad_norm": 0.2574940025806427,
+      "learning_rate": 0.0003095647694040394,
+      "loss": 2.8087,
+      "step": 444
+    },
+    {
+      "epoch": 0.025541015352059254,
+      "grad_norm": 0.17501215636730194,
+      "learning_rate": 0.0003092297366925359,
+      "loss": 2.7817,
+      "step": 445
+    },
+    {
+      "epoch": 0.025598410892176238,
+      "grad_norm": 0.4073377251625061,
+      "learning_rate": 0.0003089004904951976,
+      "loss": 2.813,
+      "step": 446
+    },
+    {
+      "epoch": 0.025655806432293225,
+      "grad_norm": 0.21654489636421204,
+      "learning_rate": 0.000308577044346046,
+      "loss": 2.8165,
+      "step": 447
+    },
+    {
+      "epoch": 0.025713201972410213,
+      "grad_norm": 0.26500189304351807,
+      "learning_rate": 0.0003082594115406856,
+      "loss": 2.8229,
+      "step": 448
+    },
+    {
+      "epoch": 0.0257705975125272,
+      "grad_norm": 0.188262477517128,
+      "learning_rate": 0.00030794760513575675,
+      "loss": 2.8112,
+      "step": 449
+    },
+    {
+      "epoch": 0.025827993052644188,
+      "grad_norm": 0.3432970643043518,
+      "learning_rate": 0.00030764163794839966,
+      "loss": 2.8241,
+      "step": 450
+    },
+    {
+      "epoch": 0.025885388592761176,
+      "grad_norm": 0.23415225744247437,
+      "learning_rate": 0.0003073415225557269,
+      "loss": 2.8039,
+      "step": 451
+    },
+    {
+      "epoch": 0.025942784132878163,
+      "grad_norm": 0.2670385241508484,
+      "learning_rate": 0.0003070472712943069,
+      "loss": 2.8215,
+      "step": 452
+    },
+    {
+      "epoch": 0.026000179672995147,
+      "grad_norm": 0.17434735596179962,
+      "learning_rate": 0.00030675889625965646,
+      "loss": 2.8352,
+      "step": 453
+    },
+    {
+      "epoch": 0.026057575213112135,
+      "grad_norm": 0.2789264917373657,
+      "learning_rate": 0.0003064764093057437,
+      "loss": 2.7856,
+      "step": 454
+    },
+    {
+      "epoch": 0.026114970753229123,
+      "grad_norm": 0.2666022479534149,
+      "learning_rate": 0.0003061998220445009,
+      "loss": 2.8063,
+      "step": 455
+    },
+    {
+      "epoch": 0.02617236629334611,
+      "grad_norm": 0.22438260912895203,
+      "learning_rate": 0.00030592914584534706,
+      "loss": 2.7783,
+      "step": 456
+    },
+    {
+      "epoch": 0.026229761833463098,
+      "grad_norm": 0.2177169770002365,
+      "learning_rate": 0.00030566439183472063,
+      "loss": 2.786,
+      "step": 457
+    },
+    {
+      "epoch": 0.026287157373580086,
+      "grad_norm": 0.22771142423152924,
+      "learning_rate": 0.000305405570895622,
+      "loss": 2.7881,
+      "step": 458
+    },
+    {
+      "epoch": 0.02634455291369707,
+      "grad_norm": 0.29228097200393677,
+      "learning_rate": 0.00030515269366716613,
+      "loss": 2.7876,
+      "step": 459
+    },
+    {
+      "epoch": 0.026401948453814057,
+      "grad_norm": 0.18204721808433533,
+      "learning_rate": 0.00030490577054414553,
+      "loss": 2.8153,
+      "step": 460
+    },
+    {
+      "epoch": 0.026459343993931045,
+      "grad_norm": 0.19830970466136932,
+      "learning_rate": 0.0003046648116766027,
+      "loss": 2.7884,
+      "step": 461
+    },
+    {
+      "epoch": 0.026516739534048032,
+      "grad_norm": 0.17311398684978485,
+      "learning_rate": 0.00030442982696941276,
+      "loss": 2.8055,
+      "step": 462
+    },
+    {
+      "epoch": 0.02657413507416502,
+      "grad_norm": 0.21194536983966827,
+      "learning_rate": 0.0003042008260818768,
+      "loss": 2.815,
+      "step": 463
+    },
+    {
+      "epoch": 0.026631530614282008,
+      "grad_norm": 0.22366400063037872,
+      "learning_rate": 0.0003039778184273243,
+      "loss": 2.7994,
+      "step": 464
+    },
+    {
+      "epoch": 0.026688926154398995,
+      "grad_norm": 0.17785237729549408,
+      "learning_rate": 0.00030376081317272645,
+      "loss": 2.8049,
+      "step": 465
+    },
+    {
+      "epoch": 0.02674632169451598,
+      "grad_norm": 0.2285715490579605,
+      "learning_rate": 0.00030354981923831934,
+      "loss": 2.8105,
+      "step": 466
+    },
+    {
+      "epoch": 0.026803717234632967,
+      "grad_norm": 0.17985928058624268,
+      "learning_rate": 0.0003033448452972373,
+      "loss": 2.8246,
+      "step": 467
+    },
+    {
+      "epoch": 0.026861112774749955,
+      "grad_norm": 0.2026437669992447,
+      "learning_rate": 0.000303145899775156,
+      "loss": 2.8192,
+      "step": 468
+    },
+    {
+      "epoch": 0.026918508314866942,
+      "grad_norm": 0.2605213522911072,
+      "learning_rate": 0.0003029529908499469,
+      "loss": 2.826,
+      "step": 469
+    },
+    {
+      "epoch": 0.02697590385498393,
+      "grad_norm": 0.22592206299304962,
+      "learning_rate": 0.00030276612645134017,
+      "loss": 2.7987,
+      "step": 470
+    },
+    {
+      "epoch": 0.027033299395100917,
+      "grad_norm": 0.2988434433937073,
+      "learning_rate": 0.0003025853142605994,
+      "loss": 2.826,
+      "step": 471
+    },
+    {
+      "epoch": 0.027090694935217905,
+      "grad_norm": 0.2247052788734436,
+      "learning_rate": 0.0003024105617102055,
+      "loss": 2.815,
+      "step": 472
+    },
+    {
+      "epoch": 0.02714809047533489,
+      "grad_norm": 0.26565778255462646,
+      "learning_rate": 0.00030224187598355145,
+      "loss": 2.8283,
+      "step": 473
+    },
+    {
+      "epoch": 0.027205486015451877,
+      "grad_norm": 0.2834932804107666,
+      "learning_rate": 0.00030207926401464675,
+      "loss": 2.8088,
+      "step": 474
+    },
+    {
+      "epoch": 0.027262881555568864,
+      "grad_norm": 0.2396688312292099,
+      "learning_rate": 0.0003019227324878324,
+      "loss": 2.8024,
+      "step": 475
+    },
+    {
+      "epoch": 0.027320277095685852,
+      "grad_norm": 0.2600051760673523,
+      "learning_rate": 0.0003017722878375066,
+      "loss": 2.8258,
+      "step": 476
+    },
+    {
+      "epoch": 0.02737767263580284,
+      "grad_norm": 0.26368406414985657,
+      "learning_rate": 0.00030162793624785957,
+      "loss": 2.7875,
+      "step": 477
+    },
+    {
+      "epoch": 0.027435068175919827,
+      "grad_norm": 0.389852911233902,
+      "learning_rate": 0.0003014896836526197,
+      "loss": 2.8166,
+      "step": 478
+    },
+    {
+      "epoch": 0.02749246371603681,
+      "grad_norm": 0.23984675109386444,
+      "learning_rate": 0.0003013575357348098,
+      "loss": 2.8025,
+      "step": 479
+    },
+    {
+      "epoch": 0.0275498592561538,
+      "grad_norm": 0.24591901898384094,
+      "learning_rate": 0.00030123149792651307,
+      "loss": 2.7898,
+      "step": 480
+    },
+    {
+      "epoch": 0.027607254796270787,
+      "grad_norm": 0.24797213077545166,
+      "learning_rate": 0.00030111157540865026,
+      "loss": 2.8291,
+      "step": 481
+    },
+    {
+      "epoch": 0.027664650336387774,
+      "grad_norm": 0.2542579770088196,
+      "learning_rate": 0.0003009977731107663,
+      "loss": 2.7868,
+      "step": 482
+    },
+    {
+      "epoch": 0.027722045876504762,
+      "grad_norm": 0.21780452132225037,
+      "learning_rate": 0.00030089009571082794,
+      "loss": 2.8051,
+      "step": 483
+    },
+    {
+      "epoch": 0.02777944141662175,
+      "grad_norm": 0.2790198028087616,
+      "learning_rate": 0.0003007885476350314,
+      "loss": 2.8004,
+      "step": 484
+    },
+    {
+      "epoch": 0.027836836956738737,
+      "grad_norm": 0.2793212831020355,
+      "learning_rate": 0.00030069313305762025,
+      "loss": 2.8077,
+      "step": 485
+    },
+    {
+      "epoch": 0.02789423249685572,
+      "grad_norm": 0.2663847506046295,
+      "learning_rate": 0.0003006038559007141,
+      "loss": 2.805,
+      "step": 486
+    },
+    {
+      "epoch": 0.02795162803697271,
+      "grad_norm": 0.2695571482181549,
+      "learning_rate": 0.0003005207198341473,
+      "loss": 2.8102,
+      "step": 487
+    },
+    {
+      "epoch": 0.028009023577089696,
+      "grad_norm": 0.3027716875076294,
+      "learning_rate": 0.0003004437282753177,
+      "loss": 2.7944,
+      "step": 488
+    },
+    {
+      "epoch": 0.028066419117206684,
+      "grad_norm": 0.25220444798469543,
+      "learning_rate": 0.0003003728843890469,
+      "loss": 2.781,
+      "step": 489
+    },
+    {
+      "epoch": 0.02812381465732367,
+      "grad_norm": 0.2733742594718933,
+      "learning_rate": 0.0003003081910874495,
+      "loss": 2.8138,
+      "step": 490
+    },
+    {
+      "epoch": 0.02818121019744066,
+      "grad_norm": 0.23873530328273773,
+      "learning_rate": 0.00030024965102981387,
+      "loss": 2.8017,
+      "step": 491
+    },
+    {
+      "epoch": 0.028238605737557643,
+      "grad_norm": 0.29158100485801697,
+      "learning_rate": 0.0003001972666224923,
+      "loss": 2.8084,
+      "step": 492
+    },
+    {
+      "epoch": 0.02829600127767463,
+      "grad_norm": 0.3079324960708618,
+      "learning_rate": 0.00030015104001880274,
+      "loss": 2.8061,
+      "step": 493
+    },
+    {
+      "epoch": 0.02835339681779162,
+      "grad_norm": 0.2448122203350067,
+      "learning_rate": 0.00030011097311893984,
+      "loss": 2.7817,
+      "step": 494
+    },
+    {
+      "epoch": 0.028410792357908606,
+      "grad_norm": 0.3495275378227234,
+      "learning_rate": 0.00030007706756989683,
+      "loss": 2.8053,
+      "step": 495
+    },
+    {
+      "epoch": 0.028468187898025594,
+      "grad_norm": 0.19935691356658936,
+      "learning_rate": 0.000300049324765398,
+      "loss": 2.7985,
+      "step": 496
+    },
+    {
+      "epoch": 0.02852558343814258,
+      "grad_norm": 0.30157798528671265,
+      "learning_rate": 0.0003000277458458415,
+      "loss": 2.8271,
+      "step": 497
+    },
+    {
+      "epoch": 0.02858297897825957,
+      "grad_norm": 0.23343823850154877,
+      "learning_rate": 0.00030001233169825214,
+      "loss": 2.807,
+      "step": 498
+    },
+    {
+      "epoch": 0.028640374518376553,
+      "grad_norm": 0.25404173135757446,
+      "learning_rate": 0.0003000030829562451,
+      "loss": 2.8072,
+      "step": 499
+    },
+    {
+      "epoch": 0.02869777005849354,
+      "grad_norm": 0.28863540291786194,
+      "learning_rate": 0.0003,
+      "loss": 2.8088,
+      "step": 500
+    },
+    {
+      "epoch": 0.02869777005849354,
+      "eval_loss": 2.735079288482666,
+      "eval_runtime": 85.4355,
+      "eval_samples_per_second": 50.506,
+      "eval_steps_per_second": 12.629,
+      "step": 500
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 500,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 150,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 4.69922551431168e+17,
+  "train_batch_size": 22,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:594db8e3ba17a8fa3661cdc5102444839e6fe80ed0f8414f52615396a149cc65
+size 5368