Model save

Browse files

Files changed (7) hide show

README.md +57 -0
all_results.json +9 -0
generation_config.json +6 -0
model-00001-of-00002.safetensors +1 -1
model-00002-of-00002.safetensors +1 -1
train_results.json +9 -0
trainer_state.json +1799 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: transformers
+model_name: Qwen2.5-3B-SFT-NR
+tags:
+- generated_from_trainer
+- trl
+- sft
+licence: license
+---
+# Model Card for Qwen2.5-3B-SFT-NR
+This model is a fine-tuned version of a base model that was not recorded when this card was generated (presumably a Qwen2.5-3B checkpoint, judging by the model name).
+It has been trained using [TRL](https://github.com/huggingface/trl).
+## Quick start
+```python
+from transformers import pipeline
+question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
+generator = pipeline("text-generation", model="qingyangzhang/Qwen2.5-3B-SFT-NR", device="cuda")
+output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
+print(output["generated_text"])
+```
+## Training procedure
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/rq6uid9l)
+This model was trained with supervised fine-tuning (SFT).
+### Framework versions
+- TRL: 0.14.0
+- Transformers: 4.48.3
+- PyTorch: 2.5.1+cu124
+- Datasets: 3.1.0
+- Tokenizers: 0.21.0
+## Citations
+Cite TRL as:
+```bibtex
+@misc{vonwerra2022trl,
+	title        = {{TRL: Transformer Reinforcement Learning}},
+	author       = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
+	year         = 2020,
+	journal      = {GitHub repository},
+	publisher    = {GitHub},
+	howpublished = {\url{https://github.com/huggingface/trl}}
+}
+```

all_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990049751243781,
+    "total_flos": 5.849427398046515e+16,
+    "train_loss": 0.6000438257755036,
+    "train_runtime": 5544.176,
+    "train_samples": 12058,
+    "train_samples_per_second": 2.175,
+    "train_steps_per_second": 0.045
+}

generation_config.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+  "bos_token_id": 151643,
+  "eos_token_id": 151643,
+  "max_new_tokens": 2048,
+  "transformers_version": "4.48.3"
+}

model-00001-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c788a82bbb8faf001079f2e87f41410938d87ed8b263b32f88c551ea5d55db96
 size 4957560304

 version https://git-lfs.github.com/spec/v1
+oid sha256:ce9686eb561a861b9768d33d2b9b535fdcb85adcd9d1a35b60a9f7275d446ea8
 size 4957560304

model-00002-of-00002.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:a2d30950a93d392cd8a0d85e276265694aa8f8765ffe5f3075c755b45884dae2
 size 1214366696

 version https://git-lfs.github.com/spec/v1
+oid sha256:3dad05e32430f7203f65bd0ee0e192096c8d5c835279b21d402a13a60d66a713
 size 1214366696

train_results.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+    "epoch": 0.9990049751243781,
+    "total_flos": 5.849427398046515e+16,
+    "train_loss": 0.6000438257755036,
+    "train_runtime": 5544.176,
+    "train_samples": 12058,
+    "train_samples_per_second": 2.175,
+    "train_steps_per_second": 0.045
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1799 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.9990049751243781,
+  "eval_steps": 100,
+  "global_step": 251,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.003980099502487562,
+      "grad_norm": 1.8041054010391235,
+      "learning_rate": 1e-06,
+      "loss": 1.2189,
+      "step": 1
+    },
+    {
+      "epoch": 0.007960199004975124,
+      "grad_norm": 1.606231451034546,
+      "learning_rate": 1e-06,
+      "loss": 1.1372,
+      "step": 2
+    },
+    {
+      "epoch": 0.011940298507462687,
+      "grad_norm": 1.5864207744598389,
+      "learning_rate": 1e-06,
+      "loss": 1.1304,
+      "step": 3
+    },
+    {
+      "epoch": 0.015920398009950248,
+      "grad_norm": 1.6760976314544678,
+      "learning_rate": 1e-06,
+      "loss": 1.3062,
+      "step": 4
+    },
+    {
+      "epoch": 0.01990049751243781,
+      "grad_norm": 1.0894922018051147,
+      "learning_rate": 1e-06,
+      "loss": 1.0851,
+      "step": 5
+    },
+    {
+      "epoch": 0.023880597014925373,
+      "grad_norm": 1.197535514831543,
+      "learning_rate": 1e-06,
+      "loss": 1.2047,
+      "step": 6
+    },
+    {
+      "epoch": 0.027860696517412936,
+      "grad_norm": 1.1388044357299805,
+      "learning_rate": 1e-06,
+      "loss": 1.1113,
+      "step": 7
+    },
+    {
+      "epoch": 0.031840796019900496,
+      "grad_norm": 1.118986964225769,
+      "learning_rate": 1e-06,
+      "loss": 1.129,
+      "step": 8
+    },
+    {
+      "epoch": 0.03582089552238806,
+      "grad_norm": 1.1859782934188843,
+      "learning_rate": 1e-06,
+      "loss": 1.1824,
+      "step": 9
+    },
+    {
+      "epoch": 0.03980099502487562,
+      "grad_norm": 1.0059858560562134,
+      "learning_rate": 1e-06,
+      "loss": 1.0065,
+      "step": 10
+    },
+    {
+      "epoch": 0.04378109452736319,
+      "grad_norm": 1.088222622871399,
+      "learning_rate": 1e-06,
+      "loss": 1.0444,
+      "step": 11
+    },
+    {
+      "epoch": 0.04776119402985075,
+      "grad_norm": 1.1014976501464844,
+      "learning_rate": 1e-06,
+      "loss": 1.0654,
+      "step": 12
+    },
+    {
+      "epoch": 0.051741293532338306,
+      "grad_norm": 0.9451438188552856,
+      "learning_rate": 1e-06,
+      "loss": 0.9278,
+      "step": 13
+    },
+    {
+      "epoch": 0.05572139303482587,
+      "grad_norm": 1.109749436378479,
+      "learning_rate": 1e-06,
+      "loss": 1.0397,
+      "step": 14
+    },
+    {
+      "epoch": 0.05970149253731343,
+      "grad_norm": 1.0444092750549316,
+      "learning_rate": 1e-06,
+      "loss": 1.0032,
+      "step": 15
+    },
+    {
+      "epoch": 0.06368159203980099,
+      "grad_norm": 1.056223750114441,
+      "learning_rate": 1e-06,
+      "loss": 0.9843,
+      "step": 16
+    },
+    {
+      "epoch": 0.06766169154228856,
+      "grad_norm": 0.9420551657676697,
+      "learning_rate": 1e-06,
+      "loss": 0.8722,
+      "step": 17
+    },
+    {
+      "epoch": 0.07164179104477612,
+      "grad_norm": 0.9519243240356445,
+      "learning_rate": 1e-06,
+      "loss": 0.8713,
+      "step": 18
+    },
+    {
+      "epoch": 0.07562189054726368,
+      "grad_norm": 0.8667258620262146,
+      "learning_rate": 1e-06,
+      "loss": 0.8384,
+      "step": 19
+    },
+    {
+      "epoch": 0.07960199004975124,
+      "grad_norm": 0.9024590253829956,
+      "learning_rate": 1e-06,
+      "loss": 0.8198,
+      "step": 20
+    },
+    {
+      "epoch": 0.08358208955223881,
+      "grad_norm": 0.8790098428726196,
+      "learning_rate": 1e-06,
+      "loss": 0.8539,
+      "step": 21
+    },
+    {
+      "epoch": 0.08756218905472637,
+      "grad_norm": 0.7695945501327515,
+      "learning_rate": 1e-06,
+      "loss": 0.797,
+      "step": 22
+    },
+    {
+      "epoch": 0.09154228855721393,
+      "grad_norm": 0.830602765083313,
+      "learning_rate": 1e-06,
+      "loss": 0.8038,
+      "step": 23
+    },
+    {
+      "epoch": 0.0955223880597015,
+      "grad_norm": 0.7355982661247253,
+      "learning_rate": 1e-06,
+      "loss": 0.7901,
+      "step": 24
+    },
+    {
+      "epoch": 0.09950248756218906,
+      "grad_norm": 0.7058648467063904,
+      "learning_rate": 1e-06,
+      "loss": 0.7933,
+      "step": 25
+    },
+    {
+      "epoch": 0.10348258706467661,
+      "grad_norm": 0.8061387538909912,
+      "learning_rate": 1e-06,
+      "loss": 0.7369,
+      "step": 26
+    },
+    {
+      "epoch": 0.10746268656716418,
+      "grad_norm": 0.7414054870605469,
+      "learning_rate": 1e-06,
+      "loss": 0.7798,
+      "step": 27
+    },
+    {
+      "epoch": 0.11144278606965174,
+      "grad_norm": 0.7229103446006775,
+      "learning_rate": 1e-06,
+      "loss": 0.7071,
+      "step": 28
+    },
+    {
+      "epoch": 0.1154228855721393,
+      "grad_norm": 0.6890265345573425,
+      "learning_rate": 1e-06,
+      "loss": 0.649,
+      "step": 29
+    },
+    {
+      "epoch": 0.11940298507462686,
+      "grad_norm": 0.6917344927787781,
+      "learning_rate": 1e-06,
+      "loss": 0.7381,
+      "step": 30
+    },
+    {
+      "epoch": 0.12338308457711443,
+      "grad_norm": 0.6370529532432556,
+      "learning_rate": 1e-06,
+      "loss": 0.7016,
+      "step": 31
+    },
+    {
+      "epoch": 0.12736318407960198,
+      "grad_norm": 0.5392922163009644,
+      "learning_rate": 1e-06,
+      "loss": 0.5861,
+      "step": 32
+    },
+    {
+      "epoch": 0.13134328358208955,
+      "grad_norm": 0.5614864826202393,
+      "learning_rate": 1e-06,
+      "loss": 0.637,
+      "step": 33
+    },
+    {
+      "epoch": 0.13532338308457711,
+      "grad_norm": 0.5575302839279175,
+      "learning_rate": 1e-06,
+      "loss": 0.6303,
+      "step": 34
+    },
+    {
+      "epoch": 0.13930348258706468,
+      "grad_norm": 0.5416925549507141,
+      "learning_rate": 1e-06,
+      "loss": 0.6533,
+      "step": 35
+    },
+    {
+      "epoch": 0.14328358208955225,
+      "grad_norm": 0.5551822185516357,
+      "learning_rate": 1e-06,
+      "loss": 0.6362,
+      "step": 36
+    },
+    {
+      "epoch": 0.1472636815920398,
+      "grad_norm": 0.5346453785896301,
+      "learning_rate": 1e-06,
+      "loss": 0.6369,
+      "step": 37
+    },
+    {
+      "epoch": 0.15124378109452735,
+      "grad_norm": 0.48347029089927673,
+      "learning_rate": 1e-06,
+      "loss": 0.6146,
+      "step": 38
+    },
+    {
+      "epoch": 0.15522388059701492,
+      "grad_norm": 0.5139867663383484,
+      "learning_rate": 1e-06,
+      "loss": 0.6108,
+      "step": 39
+    },
+    {
+      "epoch": 0.15920398009950248,
+      "grad_norm": 0.492990642786026,
+      "learning_rate": 1e-06,
+      "loss": 0.6167,
+      "step": 40
+    },
+    {
+      "epoch": 0.16318407960199005,
+      "grad_norm": 0.4089691638946533,
+      "learning_rate": 1e-06,
+      "loss": 0.5995,
+      "step": 41
+    },
+    {
+      "epoch": 0.16716417910447762,
+      "grad_norm": 0.3620274066925049,
+      "learning_rate": 1e-06,
+      "loss": 0.5853,
+      "step": 42
+    },
+    {
+      "epoch": 0.17114427860696518,
+      "grad_norm": 0.35234397649765015,
+      "learning_rate": 1e-06,
+      "loss": 0.5983,
+      "step": 43
+    },
+    {
+      "epoch": 0.17512437810945275,
+      "grad_norm": 0.3323567509651184,
+      "learning_rate": 1e-06,
+      "loss": 0.5675,
+      "step": 44
+    },
+    {
+      "epoch": 0.1791044776119403,
+      "grad_norm": 0.3100694417953491,
+      "learning_rate": 1e-06,
+      "loss": 0.6015,
+      "step": 45
+    },
+    {
+      "epoch": 0.18308457711442785,
+      "grad_norm": 0.31179943680763245,
+      "learning_rate": 1e-06,
+      "loss": 0.592,
+      "step": 46
+    },
+    {
+      "epoch": 0.18706467661691542,
+      "grad_norm": 0.3240714967250824,
+      "learning_rate": 1e-06,
+      "loss": 0.5945,
+      "step": 47
+    },
+    {
+      "epoch": 0.191044776119403,
+      "grad_norm": 0.30923616886138916,
+      "learning_rate": 1e-06,
+      "loss": 0.5788,
+      "step": 48
+    },
+    {
+      "epoch": 0.19502487562189055,
+      "grad_norm": 0.3096090257167816,
+      "learning_rate": 1e-06,
+      "loss": 0.5884,
+      "step": 49
+    },
+    {
+      "epoch": 0.19900497512437812,
+      "grad_norm": 0.2709506154060364,
+      "learning_rate": 1e-06,
+      "loss": 0.544,
+      "step": 50
+    },
+    {
+      "epoch": 0.20298507462686566,
+      "grad_norm": 0.3078024089336395,
+      "learning_rate": 1e-06,
+      "loss": 0.5854,
+      "step": 51
+    },
+    {
+      "epoch": 0.20696517412935322,
+      "grad_norm": 0.31205838918685913,
+      "learning_rate": 1e-06,
+      "loss": 0.5846,
+      "step": 52
+    },
+    {
+      "epoch": 0.2109452736318408,
+      "grad_norm": 0.2879401743412018,
+      "learning_rate": 1e-06,
+      "loss": 0.5937,
+      "step": 53
+    },
+    {
+      "epoch": 0.21492537313432836,
+      "grad_norm": 0.2684524953365326,
+      "learning_rate": 1e-06,
+      "loss": 0.5209,
+      "step": 54
+    },
+    {
+      "epoch": 0.21890547263681592,
+      "grad_norm": 0.27748343348503113,
+      "learning_rate": 1e-06,
+      "loss": 0.5575,
+      "step": 55
+    },
+    {
+      "epoch": 0.2228855721393035,
+      "grad_norm": 0.31936174631118774,
+      "learning_rate": 1e-06,
+      "loss": 0.6562,
+      "step": 56
+    },
+    {
+      "epoch": 0.22686567164179106,
+      "grad_norm": 0.30099964141845703,
+      "learning_rate": 1e-06,
+      "loss": 0.5912,
+      "step": 57
+    },
+    {
+      "epoch": 0.2308457711442786,
+      "grad_norm": 0.30249732732772827,
+      "learning_rate": 1e-06,
+      "loss": 0.657,
+      "step": 58
+    },
+    {
+      "epoch": 0.23482587064676616,
+      "grad_norm": 0.28535589575767517,
+      "learning_rate": 1e-06,
+      "loss": 0.5827,
+      "step": 59
+    },
+    {
+      "epoch": 0.23880597014925373,
+      "grad_norm": 0.2907682955265045,
+      "learning_rate": 1e-06,
+      "loss": 0.5745,
+      "step": 60
+    },
+    {
+      "epoch": 0.2427860696517413,
+      "grad_norm": 0.2832544445991516,
+      "learning_rate": 1e-06,
+      "loss": 0.5534,
+      "step": 61
+    },
+    {
+      "epoch": 0.24676616915422886,
+      "grad_norm": 0.2882274389266968,
+      "learning_rate": 1e-06,
+      "loss": 0.5717,
+      "step": 62
+    },
+    {
+      "epoch": 0.2507462686567164,
+      "grad_norm": 0.28751009702682495,
+      "learning_rate": 1e-06,
+      "loss": 0.5915,
+      "step": 63
+    },
+    {
+      "epoch": 0.25472636815920396,
+      "grad_norm": 0.2818026542663574,
+      "learning_rate": 1e-06,
+      "loss": 0.5793,
+      "step": 64
+    },
+    {
+      "epoch": 0.25870646766169153,
+      "grad_norm": 0.29114875197410583,
+      "learning_rate": 1e-06,
+      "loss": 0.5577,
+      "step": 65
+    },
+    {
+      "epoch": 0.2626865671641791,
+      "grad_norm": 0.3001895546913147,
+      "learning_rate": 1e-06,
+      "loss": 0.5792,
+      "step": 66
+    },
+    {
+      "epoch": 0.26666666666666666,
+      "grad_norm": 0.28489118814468384,
+      "learning_rate": 1e-06,
+      "loss": 0.6217,
+      "step": 67
+    },
+    {
+      "epoch": 0.27064676616915423,
+      "grad_norm": 0.27548784017562866,
+      "learning_rate": 1e-06,
+      "loss": 0.603,
+      "step": 68
+    },
+    {
+      "epoch": 0.2746268656716418,
+      "grad_norm": 0.2983139455318451,
+      "learning_rate": 1e-06,
+      "loss": 0.6069,
+      "step": 69
+    },
+    {
+      "epoch": 0.27860696517412936,
+      "grad_norm": 0.2885805070400238,
+      "learning_rate": 1e-06,
+      "loss": 0.6058,
+      "step": 70
+    },
+    {
+      "epoch": 0.28258706467661693,
+      "grad_norm": 0.28651854395866394,
+      "learning_rate": 1e-06,
+      "loss": 0.5814,
+      "step": 71
+    },
+    {
+      "epoch": 0.2865671641791045,
+      "grad_norm": 0.2910130023956299,
+      "learning_rate": 1e-06,
+      "loss": 0.6039,
+      "step": 72
+    },
+    {
+      "epoch": 0.29054726368159206,
+      "grad_norm": 0.2883201241493225,
+      "learning_rate": 1e-06,
+      "loss": 0.586,
+      "step": 73
+    },
+    {
+      "epoch": 0.2945273631840796,
+      "grad_norm": 0.27827897667884827,
+      "learning_rate": 1e-06,
+      "loss": 0.5844,
+      "step": 74
+    },
+    {
+      "epoch": 0.29850746268656714,
+      "grad_norm": 0.2674331068992615,
+      "learning_rate": 1e-06,
+      "loss": 0.5966,
+      "step": 75
+    },
+    {
+      "epoch": 0.3024875621890547,
+      "grad_norm": 0.27721738815307617,
+      "learning_rate": 1e-06,
+      "loss": 0.5651,
+      "step": 76
+    },
+    {
+      "epoch": 0.30646766169154227,
+      "grad_norm": 0.29553672671318054,
+      "learning_rate": 1e-06,
+      "loss": 0.5578,
+      "step": 77
+    },
+    {
+      "epoch": 0.31044776119402984,
+      "grad_norm": 0.27353787422180176,
+      "learning_rate": 1e-06,
+      "loss": 0.5778,
+      "step": 78
+    },
+    {
+      "epoch": 0.3144278606965174,
+      "grad_norm": 0.2708923816680908,
+      "learning_rate": 1e-06,
+      "loss": 0.5637,
+      "step": 79
+    },
+    {
+      "epoch": 0.31840796019900497,
+      "grad_norm": 0.2771095931529999,
+      "learning_rate": 1e-06,
+      "loss": 0.5421,
+      "step": 80
+    },
+    {
+      "epoch": 0.32238805970149254,
+      "grad_norm": 0.28794559836387634,
+      "learning_rate": 1e-06,
+      "loss": 0.56,
+      "step": 81
+    },
+    {
+      "epoch": 0.3263681592039801,
+      "grad_norm": 0.27953365445137024,
+      "learning_rate": 1e-06,
+      "loss": 0.5943,
+      "step": 82
+    },
+    {
+      "epoch": 0.33034825870646767,
+      "grad_norm": 0.2918912470340729,
+      "learning_rate": 1e-06,
+      "loss": 0.5797,
+      "step": 83
+    },
+    {
+      "epoch": 0.33432835820895523,
+      "grad_norm": 0.29445740580558777,
+      "learning_rate": 1e-06,
+      "loss": 0.5675,
+      "step": 84
+    },
+    {
+      "epoch": 0.3383084577114428,
+      "grad_norm": 0.2901161313056946,
+      "learning_rate": 1e-06,
+      "loss": 0.5775,
+      "step": 85
+    },
+    {
+      "epoch": 0.34228855721393037,
+      "grad_norm": 0.27226191759109497,
+      "learning_rate": 1e-06,
+      "loss": 0.5638,
+      "step": 86
+    },
+    {
+      "epoch": 0.34626865671641793,
+      "grad_norm": 0.28128597140312195,
+      "learning_rate": 1e-06,
+      "loss": 0.5591,
+      "step": 87
+    },
+    {
+      "epoch": 0.3502487562189055,
+      "grad_norm": 0.2813471853733063,
+      "learning_rate": 1e-06,
+      "loss": 0.5989,
+      "step": 88
+    },
+    {
+      "epoch": 0.354228855721393,
+      "grad_norm": 0.2899133265018463,
+      "learning_rate": 1e-06,
+      "loss": 0.584,
+      "step": 89
+    },
+    {
+      "epoch": 0.3582089552238806,
+      "grad_norm": 0.2919646203517914,
+      "learning_rate": 1e-06,
+      "loss": 0.5764,
+      "step": 90
+    },
+    {
+      "epoch": 0.36218905472636814,
+      "grad_norm": 0.2885926365852356,
+      "learning_rate": 1e-06,
+      "loss": 0.5623,
+      "step": 91
+    },
+    {
+      "epoch": 0.3661691542288557,
+      "grad_norm": 0.28255367279052734,
+      "learning_rate": 1e-06,
+      "loss": 0.6061,
+      "step": 92
+    },
+    {
+      "epoch": 0.3701492537313433,
+      "grad_norm": 0.2776722013950348,
+      "learning_rate": 1e-06,
+      "loss": 0.588,
+      "step": 93
+    },
+    {
+      "epoch": 0.37412935323383084,
+      "grad_norm": 0.3004148304462433,
+      "learning_rate": 1e-06,
+      "loss": 0.6002,
+      "step": 94
+    },
+    {
+      "epoch": 0.3781094527363184,
+      "grad_norm": 0.2883853316307068,
+      "learning_rate": 1e-06,
+      "loss": 0.5886,
+      "step": 95
+    },
+    {
+      "epoch": 0.382089552238806,
+      "grad_norm": 0.2858606278896332,
+      "learning_rate": 1e-06,
+      "loss": 0.546,
+      "step": 96
+    },
+    {
+      "epoch": 0.38606965174129354,
+      "grad_norm": 0.30112016201019287,
+      "learning_rate": 1e-06,
+      "loss": 0.5814,
+      "step": 97
+    },
+    {
+      "epoch": 0.3900497512437811,
+      "grad_norm": 0.2831226587295532,
+      "learning_rate": 1e-06,
+      "loss": 0.5411,
+      "step": 98
+    },
+    {
+      "epoch": 0.3940298507462687,
+      "grad_norm": 0.3117291331291199,
+      "learning_rate": 1e-06,
+      "loss": 0.6567,
+      "step": 99
+    },
+    {
+      "epoch": 0.39800995024875624,
+      "grad_norm": 0.2813672125339508,
+      "learning_rate": 1e-06,
+      "loss": 0.5674,
+      "step": 100
+    },
+    {
+      "epoch": 0.4019900497512438,
+      "grad_norm": 0.2731095850467682,
+      "learning_rate": 1e-06,
+      "loss": 0.5819,
+      "step": 101
+    },
+    {
+      "epoch": 0.4059701492537313,
+      "grad_norm": 0.29545432329177856,
+      "learning_rate": 1e-06,
+      "loss": 0.5966,
+      "step": 102
+    },
+    {
+      "epoch": 0.4099502487562189,
+      "grad_norm": 0.26830869913101196,
+      "learning_rate": 1e-06,
+      "loss": 0.5747,
+      "step": 103
+    },
+    {
+      "epoch": 0.41393034825870645,
+      "grad_norm": 0.30151620507240295,
+      "learning_rate": 1e-06,
+      "loss": 0.6733,
+      "step": 104
+    },
+    {
+      "epoch": 0.417910447761194,
+      "grad_norm": 0.2833845317363739,
+      "learning_rate": 1e-06,
+      "loss": 0.595,
+      "step": 105
+    },
+    {
+      "epoch": 0.4218905472636816,
+      "grad_norm": 0.27560508251190186,
+      "learning_rate": 1e-06,
+      "loss": 0.5554,
+      "step": 106
+    },
+    {
+      "epoch": 0.42587064676616915,
+      "grad_norm": 0.3009320795536041,
+      "learning_rate": 1e-06,
+      "loss": 0.5698,
+      "step": 107
+    },
+    {
+      "epoch": 0.4298507462686567,
+      "grad_norm": 0.2834017872810364,
+      "learning_rate": 1e-06,
+      "loss": 0.5904,
+      "step": 108
+    },
+    {
+      "epoch": 0.4338308457711443,
+      "grad_norm": 0.27971693873405457,
+      "learning_rate": 1e-06,
+      "loss": 0.5555,
+      "step": 109
+    },
+    {
+      "epoch": 0.43781094527363185,
+      "grad_norm": 0.27217191457748413,
+      "learning_rate": 1e-06,
+      "loss": 0.5594,
+      "step": 110
+    },
+    {
+      "epoch": 0.4417910447761194,
+      "grad_norm": 0.28083258867263794,
+      "learning_rate": 1e-06,
+      "loss": 0.5766,
+      "step": 111
+    },
+    {
+      "epoch": 0.445771144278607,
+      "grad_norm": 0.29860496520996094,
+      "learning_rate": 1e-06,
+      "loss": 0.5622,
+      "step": 112
+    },
+    {
+      "epoch": 0.44975124378109455,
+      "grad_norm": 0.2839198410511017,
+      "learning_rate": 1e-06,
+      "loss": 0.5441,
+      "step": 113
+    },
+    {
+      "epoch": 0.4537313432835821,
+      "grad_norm": 0.28053733706474304,
+      "learning_rate": 1e-06,
+      "loss": 0.545,
+      "step": 114
+    },
+    {
+      "epoch": 0.4577114427860697,
+      "grad_norm": 0.28944674134254456,
+      "learning_rate": 1e-06,
+      "loss": 0.5414,
+      "step": 115
+    },
+    {
+      "epoch": 0.4616915422885572,
+      "grad_norm": 1.7277145385742188,
+      "learning_rate": 1e-06,
+      "loss": 0.5376,
+      "step": 116
+    },
+    {
+      "epoch": 0.46567164179104475,
+      "grad_norm": 0.26408037543296814,
+      "learning_rate": 1e-06,
+      "loss": 0.5273,
+      "step": 117
+    },
+    {
+      "epoch": 0.4696517412935323,
+      "grad_norm": 0.2752501666545868,
+      "learning_rate": 1e-06,
+      "loss": 0.5223,
+      "step": 118
+    },
+    {
+      "epoch": 0.4736318407960199,
+      "grad_norm": 0.31200143694877625,
+      "learning_rate": 1e-06,
+      "loss": 0.6251,
+      "step": 119
+    },
+    {
+      "epoch": 0.47761194029850745,
+      "grad_norm": 0.2889968156814575,
+      "learning_rate": 1e-06,
+      "loss": 0.548,
+      "step": 120
+    },
+    {
+      "epoch": 0.481592039800995,
+      "grad_norm": 0.272776335477829,
+      "learning_rate": 1e-06,
+      "loss": 0.5353,
+      "step": 121
+    },
+    {
+      "epoch": 0.4855721393034826,
+      "grad_norm": 0.29524046182632446,
+      "learning_rate": 1e-06,
+      "loss": 0.5834,
+      "step": 122
+    },
+    {
+      "epoch": 0.48955223880597015,
+      "grad_norm": 0.2750682830810547,
+      "learning_rate": 1e-06,
+      "loss": 0.5769,
+      "step": 123
+    },
+    {
+      "epoch": 0.4935323383084577,
+      "grad_norm": 0.28290194272994995,
+      "learning_rate": 1e-06,
+      "loss": 0.5749,
+      "step": 124
+    },
+    {
+      "epoch": 0.4975124378109453,
+      "grad_norm": 0.2784881889820099,
+      "learning_rate": 1e-06,
+      "loss": 0.5675,
+      "step": 125
+    },
+    {
+      "epoch": 0.5014925373134328,
+      "grad_norm": 0.28352829813957214,
+      "learning_rate": 1e-06,
+      "loss": 0.5544,
+      "step": 126
+    },
+    {
+      "epoch": 0.5054726368159204,
+      "grad_norm": 0.4005744457244873,
+      "learning_rate": 1e-06,
+      "loss": 0.4916,
+      "step": 127
+    },
+    {
+      "epoch": 0.5094527363184079,
+      "grad_norm": 0.2907276153564453,
+      "learning_rate": 1e-06,
+      "loss": 0.5842,
+      "step": 128
+    },
+    {
+      "epoch": 0.5134328358208955,
+      "grad_norm": 0.27371498942375183,
+      "learning_rate": 1e-06,
+      "loss": 0.5298,
+      "step": 129
+    },
+    {
+      "epoch": 0.5174129353233831,
+      "grad_norm": 0.268046110868454,
+      "learning_rate": 1e-06,
+      "loss": 0.5488,
+      "step": 130
+    },
+    {
+      "epoch": 0.5213930348258706,
+      "grad_norm": 0.27211833000183105,
+      "learning_rate": 1e-06,
+      "loss": 0.548,
+      "step": 131
+    },
+    {
+      "epoch": 0.5253731343283582,
+      "grad_norm": 0.28055205941200256,
+      "learning_rate": 1e-06,
+      "loss": 0.5506,
+      "step": 132
+    },
+    {
+      "epoch": 0.5293532338308458,
+      "grad_norm": 0.28549808263778687,
+      "learning_rate": 1e-06,
+      "loss": 0.5514,
+      "step": 133
+    },
+    {
+      "epoch": 0.5333333333333333,
+      "grad_norm": 0.2873031198978424,
+      "learning_rate": 1e-06,
+      "loss": 0.5868,
+      "step": 134
+    },
+    {
+      "epoch": 0.5373134328358209,
+      "grad_norm": 0.26007169485092163,
+      "learning_rate": 1e-06,
+      "loss": 0.4835,
+      "step": 135
+    },
+    {
+      "epoch": 0.5412935323383085,
+      "grad_norm": 0.27581357955932617,
+      "learning_rate": 1e-06,
+      "loss": 0.515,
+      "step": 136
+    },
+    {
+      "epoch": 0.545273631840796,
+      "grad_norm": 0.2559061050415039,
+      "learning_rate": 1e-06,
+      "loss": 0.4964,
+      "step": 137
+    },
+    {
+      "epoch": 0.5492537313432836,
+      "grad_norm": 0.26830771565437317,
+      "learning_rate": 1e-06,
+      "loss": 0.5285,
+      "step": 138
+    },
+    {
+      "epoch": 0.5532338308457712,
+      "grad_norm": 0.2840443253517151,
+      "learning_rate": 1e-06,
+      "loss": 0.5135,
+      "step": 139
+    },
+    {
+      "epoch": 0.5572139303482587,
+      "grad_norm": 0.27029529213905334,
+      "learning_rate": 1e-06,
+      "loss": 0.5273,
+      "step": 140
+    },
+    {
+      "epoch": 0.5611940298507463,
+      "grad_norm": 0.2841308116912842,
+      "learning_rate": 1e-06,
+      "loss": 0.5804,
+      "step": 141
+    },
+    {
+      "epoch": 0.5651741293532339,
+      "grad_norm": 0.28251802921295166,
+      "learning_rate": 1e-06,
+      "loss": 0.5554,
+      "step": 142
+    },
+    {
+      "epoch": 0.5691542288557214,
+      "grad_norm": 0.2795189321041107,
+      "learning_rate": 1e-06,
+      "loss": 0.5299,
+      "step": 143
+    },
+    {
+      "epoch": 0.573134328358209,
+      "grad_norm": 0.29494765400886536,
+      "learning_rate": 1e-06,
+      "loss": 0.5866,
+      "step": 144
+    },
+    {
+      "epoch": 0.5771144278606966,
+      "grad_norm": 0.26426634192466736,
+      "learning_rate": 1e-06,
+      "loss": 0.4921,
+      "step": 145
+    },
+    {
+      "epoch": 0.5810945273631841,
+      "grad_norm": 0.27161064743995667,
+      "learning_rate": 1e-06,
+      "loss": 0.5156,
+      "step": 146
+    },
+    {
+      "epoch": 0.5850746268656717,
+      "grad_norm": 0.2546272277832031,
+      "learning_rate": 1e-06,
+      "loss": 0.4764,
+      "step": 147
+    },
+    {
+      "epoch": 0.5890547263681593,
+      "grad_norm": 0.26822739839553833,
+      "learning_rate": 1e-06,
+      "loss": 0.5317,
+      "step": 148
+    },
+    {
+      "epoch": 0.5930348258706468,
+      "grad_norm": 0.28637799620628357,
+      "learning_rate": 1e-06,
+      "loss": 0.5488,
+      "step": 149
+    },
+    {
+      "epoch": 0.5970149253731343,
+      "grad_norm": 0.29014742374420166,
+      "learning_rate": 1e-06,
+      "loss": 0.5567,
+      "step": 150
+    },
+    {
+      "epoch": 0.6009950248756218,
+      "grad_norm": 0.2683526873588562,
+      "learning_rate": 1e-06,
+      "loss": 0.5511,
+      "step": 151
+    },
+    {
+      "epoch": 0.6049751243781094,
+      "grad_norm": 0.27193310856819153,
+      "learning_rate": 1e-06,
+      "loss": 0.5253,
+      "step": 152
+    },
+    {
+      "epoch": 0.608955223880597,
+      "grad_norm": 0.808740496635437,
+      "learning_rate": 1e-06,
+      "loss": 0.5254,
+      "step": 153
+    },
+    {
+      "epoch": 0.6129353233830845,
+      "grad_norm": 0.2881057858467102,
+      "learning_rate": 1e-06,
+      "loss": 0.5668,
+      "step": 154
+    },
+    {
+      "epoch": 0.6169154228855721,
+      "grad_norm": 0.28654593229293823,
+      "learning_rate": 1e-06,
+      "loss": 0.6033,
+      "step": 155
+    },
+    {
+      "epoch": 0.6208955223880597,
+      "grad_norm": 0.29203689098358154,
+      "learning_rate": 1e-06,
+      "loss": 0.5548,
+      "step": 156
+    },
+    {
+      "epoch": 0.6248756218905472,
+      "grad_norm": 0.2731221318244934,
+      "learning_rate": 1e-06,
+      "loss": 0.4972,
+      "step": 157
+    },
+    {
+      "epoch": 0.6288557213930348,
+      "grad_norm": 0.27775096893310547,
+      "learning_rate": 1e-06,
+      "loss": 0.4988,
+      "step": 158
+    },
+    {
+      "epoch": 0.6328358208955224,
+      "grad_norm": 0.2725508511066437,
+      "learning_rate": 1e-06,
+      "loss": 0.5338,
+      "step": 159
+    },
+    {
+      "epoch": 0.6368159203980099,
+      "grad_norm": 0.2905254364013672,
+      "learning_rate": 1e-06,
+      "loss": 0.5502,
+      "step": 160
+    },
+    {
+      "epoch": 0.6407960199004975,
+      "grad_norm": 0.2800814211368561,
+      "learning_rate": 1e-06,
+      "loss": 0.524,
+      "step": 161
+    },
+    {
+      "epoch": 0.6447761194029851,
+      "grad_norm": 0.29800140857696533,
+      "learning_rate": 1e-06,
+      "loss": 0.5658,
+      "step": 162
+    },
+    {
+      "epoch": 0.6487562189054726,
+      "grad_norm": 0.289701372385025,
+      "learning_rate": 1e-06,
+      "loss": 0.5322,
+      "step": 163
+    },
+    {
+      "epoch": 0.6527363184079602,
+      "grad_norm": 0.3027022182941437,
+      "learning_rate": 1e-06,
+      "loss": 0.5575,
+      "step": 164
+    },
+    {
+      "epoch": 0.6567164179104478,
+      "grad_norm": 0.29252082109451294,
+      "learning_rate": 1e-06,
+      "loss": 0.559,
+      "step": 165
+    },
+    {
+      "epoch": 0.6606965174129353,
+      "grad_norm": 0.2698836326599121,
+      "learning_rate": 1e-06,
+      "loss": 0.502,
+      "step": 166
+    },
+    {
+      "epoch": 0.6646766169154229,
+      "grad_norm": 0.27977052330970764,
+      "learning_rate": 1e-06,
+      "loss": 0.483,
+      "step": 167
+    },
+    {
+      "epoch": 0.6686567164179105,
+      "grad_norm": 0.2937949001789093,
+      "learning_rate": 1e-06,
+      "loss": 0.5613,
+      "step": 168
+    },
+    {
+      "epoch": 0.672636815920398,
+      "grad_norm": 0.2905248701572418,
+      "learning_rate": 1e-06,
+      "loss": 0.5369,
+      "step": 169
+    },
+    {
+      "epoch": 0.6766169154228856,
+      "grad_norm": 0.27426132559776306,
+      "learning_rate": 1e-06,
+      "loss": 0.4985,
+      "step": 170
+    },
+    {
+      "epoch": 0.6805970149253732,
+      "grad_norm": 0.2826381325721741,
+      "learning_rate": 1e-06,
+      "loss": 0.525,
+      "step": 171
+    },
+    {
+      "epoch": 0.6845771144278607,
+      "grad_norm": 0.2896779477596283,
+      "learning_rate": 1e-06,
+      "loss": 0.5503,
+      "step": 172
+    },
+    {
+      "epoch": 0.6885572139303483,
+      "grad_norm": 0.27713751792907715,
+      "learning_rate": 1e-06,
+      "loss": 0.5198,
+      "step": 173
+    },
+    {
+      "epoch": 0.6925373134328359,
+      "grad_norm": 0.29340362548828125,
+      "learning_rate": 1e-06,
+      "loss": 0.5588,
+      "step": 174
+    },
+    {
+      "epoch": 0.6965174129353234,
+      "grad_norm": 0.26327288150787354,
+      "learning_rate": 1e-06,
+      "loss": 0.5044,
+      "step": 175
+    },
+    {
+      "epoch": 0.700497512437811,
+      "grad_norm": 0.2810980975627899,
+      "learning_rate": 1e-06,
+      "loss": 0.5336,
+      "step": 176
+    },
+    {
+      "epoch": 0.7044776119402985,
+      "grad_norm": 0.2798118591308594,
+      "learning_rate": 1e-06,
+      "loss": 0.5623,
+      "step": 177
+    },
+    {
+      "epoch": 0.708457711442786,
+      "grad_norm": 0.27893081307411194,
+      "learning_rate": 1e-06,
+      "loss": 0.5098,
+      "step": 178
+    },
+    {
+      "epoch": 0.7124378109452736,
+      "grad_norm": 0.2879588305950165,
+      "learning_rate": 1e-06,
+      "loss": 0.5581,
+      "step": 179
+    },
+    {
+      "epoch": 0.7164179104477612,
+      "grad_norm": 0.2735341787338257,
+      "learning_rate": 1e-06,
+      "loss": 0.4972,
+      "step": 180
+    },
+    {
+      "epoch": 0.7203980099502487,
+      "grad_norm": 0.28305062651634216,
+      "learning_rate": 1e-06,
+      "loss": 0.5198,
+      "step": 181
+    },
+    {
+      "epoch": 0.7243781094527363,
+      "grad_norm": 0.2881869375705719,
+      "learning_rate": 1e-06,
+      "loss": 0.5236,
+      "step": 182
+    },
+    {
+      "epoch": 0.7283582089552239,
+      "grad_norm": 0.30144739151000977,
+      "learning_rate": 1e-06,
+      "loss": 0.5406,
+      "step": 183
+    },
+    {
+      "epoch": 0.7323383084577114,
+      "grad_norm": 0.28926968574523926,
+      "learning_rate": 1e-06,
+      "loss": 0.5571,
+      "step": 184
+    },
+    {
+      "epoch": 0.736318407960199,
+      "grad_norm": 0.29733872413635254,
+      "learning_rate": 1e-06,
+      "loss": 0.6002,
+      "step": 185
+    },
+    {
+      "epoch": 0.7402985074626866,
+      "grad_norm": 0.28750744462013245,
+      "learning_rate": 1e-06,
+      "loss": 0.5629,
+      "step": 186
+    },
+    {
+      "epoch": 0.7442786069651741,
+      "grad_norm": 0.25272336602211,
+      "learning_rate": 1e-06,
+      "loss": 0.5315,
+      "step": 187
+    },
+    {
+      "epoch": 0.7482587064676617,
+      "grad_norm": 0.3123670220375061,
+      "learning_rate": 1e-06,
+      "loss": 0.5518,
+      "step": 188
+    },
+    {
+      "epoch": 0.7522388059701492,
+      "grad_norm": 0.287804514169693,
+      "learning_rate": 1e-06,
+      "loss": 0.5308,
+      "step": 189
+    },
+    {
+      "epoch": 0.7562189054726368,
+      "grad_norm": 0.27801209688186646,
+      "learning_rate": 1e-06,
+      "loss": 0.4952,
+      "step": 190
+    },
+    {
+      "epoch": 0.7601990049751244,
+      "grad_norm": 0.29395267367362976,
+      "learning_rate": 1e-06,
+      "loss": 0.5072,
+      "step": 191
+    },
+    {
+      "epoch": 0.764179104477612,
+      "grad_norm": 0.29356127977371216,
+      "learning_rate": 1e-06,
+      "loss": 0.5451,
+      "step": 192
+    },
+    {
+      "epoch": 0.7681592039800995,
+      "grad_norm": 0.27663421630859375,
+      "learning_rate": 1e-06,
+      "loss": 0.5338,
+      "step": 193
+    },
+    {
+      "epoch": 0.7721393034825871,
+      "grad_norm": 0.27448275685310364,
+      "learning_rate": 1e-06,
+      "loss": 0.5382,
+      "step": 194
+    },
+    {
+      "epoch": 0.7761194029850746,
+      "grad_norm": 0.2774457037448883,
+      "learning_rate": 1e-06,
+      "loss": 0.4961,
+      "step": 195
+    },
+    {
+      "epoch": 0.7800995024875622,
+      "grad_norm": 0.30790749192237854,
+      "learning_rate": 1e-06,
+      "loss": 0.5553,
+      "step": 196
+    },
+    {
+      "epoch": 0.7840796019900498,
+      "grad_norm": 0.30943363904953003,
+      "learning_rate": 1e-06,
+      "loss": 0.5514,
+      "step": 197
+    },
+    {
+      "epoch": 0.7880597014925373,
+      "grad_norm": 0.265715092420578,
+      "learning_rate": 1e-06,
+      "loss": 0.494,
+      "step": 198
+    },
+    {
+      "epoch": 0.7920398009950249,
+      "grad_norm": 0.28460168838500977,
+      "learning_rate": 1e-06,
+      "loss": 0.5318,
+      "step": 199
+    },
+    {
+      "epoch": 0.7960199004975125,
+      "grad_norm": 0.2925533354282379,
+      "learning_rate": 1e-06,
+      "loss": 0.5197,
+      "step": 200
+    },
+    {
+      "epoch": 0.8,
+      "grad_norm": 0.2781723141670227,
+      "learning_rate": 1e-06,
+      "loss": 0.4839,
+      "step": 201
+    },
+    {
+      "epoch": 0.8039800995024876,
+      "grad_norm": 0.28367018699645996,
+      "learning_rate": 1e-06,
+      "loss": 0.5533,
+      "step": 202
+    },
+    {
+      "epoch": 0.8079601990049752,
+      "grad_norm": 0.2904638350009918,
+      "learning_rate": 1e-06,
+      "loss": 0.5128,
+      "step": 203
+    },
+    {
+      "epoch": 0.8119402985074626,
+      "grad_norm": 0.2869066596031189,
+      "learning_rate": 1e-06,
+      "loss": 0.5842,
+      "step": 204
+    },
+    {
+      "epoch": 0.8159203980099502,
+      "grad_norm": 0.2981327176094055,
+      "learning_rate": 1e-06,
+      "loss": 0.6124,
+      "step": 205
+    },
+    {
+      "epoch": 0.8199004975124378,
+      "grad_norm": 0.3040124177932739,
+      "learning_rate": 1e-06,
+      "loss": 0.5407,
+      "step": 206
+    },
+    {
+      "epoch": 0.8238805970149253,
+      "grad_norm": 0.283186674118042,
+      "learning_rate": 1e-06,
+      "loss": 0.5559,
+      "step": 207
+    },
+    {
+      "epoch": 0.8278606965174129,
+      "grad_norm": 0.29206421971321106,
+      "learning_rate": 1e-06,
+      "loss": 0.5143,
+      "step": 208
+    },
+    {
+      "epoch": 0.8318407960199005,
+      "grad_norm": 0.2698039412498474,
+      "learning_rate": 1e-06,
+      "loss": 0.5092,
+      "step": 209
+    },
+    {
+      "epoch": 0.835820895522388,
+      "grad_norm": 0.3050399720668793,
+      "learning_rate": 1e-06,
+      "loss": 0.5436,
+      "step": 210
+    },
+    {
+      "epoch": 0.8398009950248756,
+      "grad_norm": 0.2690124809741974,
+      "learning_rate": 1e-06,
+      "loss": 0.5118,
+      "step": 211
+    },
+    {
+      "epoch": 0.8437810945273632,
+      "grad_norm": 0.2941598892211914,
+      "learning_rate": 1e-06,
+      "loss": 0.5776,
+      "step": 212
+    },
+    {
+      "epoch": 0.8477611940298507,
+      "grad_norm": 0.267484188079834,
+      "learning_rate": 1e-06,
+      "loss": 0.481,
+      "step": 213
+    },
+    {
+      "epoch": 0.8517412935323383,
+      "grad_norm": 0.3034264147281647,
+      "learning_rate": 1e-06,
+      "loss": 0.5479,
+      "step": 214
+    },
+    {
+      "epoch": 0.8557213930348259,
+      "grad_norm": 0.29359570145606995,
+      "learning_rate": 1e-06,
+      "loss": 0.5369,
+      "step": 215
+    },
+    {
+      "epoch": 0.8597014925373134,
+      "grad_norm": 0.2907046377658844,
+      "learning_rate": 1e-06,
+      "loss": 0.5127,
+      "step": 216
+    },
+    {
+      "epoch": 0.863681592039801,
+      "grad_norm": 0.2787851095199585,
+      "learning_rate": 1e-06,
+      "loss": 0.5398,
+      "step": 217
+    },
+    {
+      "epoch": 0.8676616915422886,
+      "grad_norm": 0.29438599944114685,
+      "learning_rate": 1e-06,
+      "loss": 0.5337,
+      "step": 218
+    },
+    {
+      "epoch": 0.8716417910447761,
+      "grad_norm": 0.2769269645214081,
+      "learning_rate": 1e-06,
+      "loss": 0.5016,
+      "step": 219
+    },
+    {
+      "epoch": 0.8756218905472637,
+      "grad_norm": 0.27982795238494873,
+      "learning_rate": 1e-06,
+      "loss": 0.5489,
+      "step": 220
+    },
+    {
+      "epoch": 0.8796019900497513,
+      "grad_norm": 0.2620881497859955,
+      "learning_rate": 1e-06,
+      "loss": 0.4893,
+      "step": 221
+    },
+    {
+      "epoch": 0.8835820895522388,
+      "grad_norm": 0.2869341969490051,
+      "learning_rate": 1e-06,
+      "loss": 0.5365,
+      "step": 222
+    },
+    {
+      "epoch": 0.8875621890547264,
+      "grad_norm": 0.28541088104248047,
+      "learning_rate": 1e-06,
+      "loss": 0.5234,
+      "step": 223
+    },
+    {
+      "epoch": 0.891542288557214,
+      "grad_norm": 0.2907220125198364,
+      "learning_rate": 1e-06,
+      "loss": 0.5224,
+      "step": 224
+    },
+    {
+      "epoch": 0.8955223880597015,
+      "grad_norm": 0.3106067180633545,
+      "learning_rate": 1e-06,
+      "loss": 0.5616,
+      "step": 225
+    },
+    {
+      "epoch": 0.8995024875621891,
+      "grad_norm": 0.2765253782272339,
+      "learning_rate": 1e-06,
+      "loss": 0.4978,
+      "step": 226
+    },
+    {
+      "epoch": 0.9034825870646767,
+      "grad_norm": 0.2780396342277527,
+      "learning_rate": 1e-06,
+      "loss": 0.5197,
+      "step": 227
+    },
+    {
+      "epoch": 0.9074626865671642,
+      "grad_norm": 0.2735743224620819,
+      "learning_rate": 1e-06,
+      "loss": 0.5081,
+      "step": 228
+    },
+    {
+      "epoch": 0.9114427860696518,
+      "grad_norm": 0.2986888289451599,
+      "learning_rate": 1e-06,
+      "loss": 0.504,
+      "step": 229
+    },
+    {
+      "epoch": 0.9154228855721394,
+      "grad_norm": 0.2711998522281647,
+      "learning_rate": 1e-06,
+      "loss": 0.5258,
+      "step": 230
+    },
+    {
+      "epoch": 0.9194029850746268,
+      "grad_norm": 0.27429237961769104,
+      "learning_rate": 1e-06,
+      "loss": 0.4983,
+      "step": 231
+    },
+    {
+      "epoch": 0.9233830845771144,
+      "grad_norm": 0.28108328580856323,
+      "learning_rate": 1e-06,
+      "loss": 0.5817,
+      "step": 232
+    },
+    {
+      "epoch": 0.9273631840796019,
+      "grad_norm": 0.273513525724411,
+      "learning_rate": 1e-06,
+      "loss": 0.5024,
+      "step": 233
+    },
+    {
+      "epoch": 0.9313432835820895,
+      "grad_norm": 0.2856132686138153,
+      "learning_rate": 1e-06,
+      "loss": 0.5257,
+      "step": 234
+    },
+    {
+      "epoch": 0.9353233830845771,
+      "grad_norm": 0.2727264165878296,
+      "learning_rate": 1e-06,
+      "loss": 0.4796,
+      "step": 235
+    },
+    {
+      "epoch": 0.9393034825870646,
+      "grad_norm": 0.2819795608520508,
+      "learning_rate": 1e-06,
+      "loss": 0.4993,
+      "step": 236
+    },
+    {
+      "epoch": 0.9432835820895522,
+      "grad_norm": 0.29131144285202026,
+      "learning_rate": 1e-06,
+      "loss": 0.492,
+      "step": 237
+    },
+    {
+      "epoch": 0.9472636815920398,
+      "grad_norm": 0.29098305106163025,
+      "learning_rate": 1e-06,
+      "loss": 0.5257,
+      "step": 238
+    },
+    {
+      "epoch": 0.9512437810945273,
+      "grad_norm": 0.2734336853027344,
+      "learning_rate": 1e-06,
+      "loss": 0.487,
+      "step": 239
+    },
+    {
+      "epoch": 0.9552238805970149,
+      "grad_norm": 0.26648443937301636,
+      "learning_rate": 1e-06,
+      "loss": 0.4864,
+      "step": 240
+    },
+    {
+      "epoch": 0.9592039800995025,
+      "grad_norm": 0.2583979666233063,
+      "learning_rate": 1e-06,
+      "loss": 0.4622,
+      "step": 241
+    },
+    {
+      "epoch": 0.96318407960199,
+      "grad_norm": 0.26614758372306824,
+      "learning_rate": 1e-06,
+      "loss": 0.5096,
+      "step": 242
+    },
+    {
+      "epoch": 0.9671641791044776,
+      "grad_norm": 0.25741949677467346,
+      "learning_rate": 1e-06,
+      "loss": 0.4801,
+      "step": 243
+    },
+    {
+      "epoch": 0.9711442786069652,
+      "grad_norm": 0.2788185477256775,
+      "learning_rate": 1e-06,
+      "loss": 0.4905,
+      "step": 244
+    },
+    {
+      "epoch": 0.9751243781094527,
+      "grad_norm": 0.282296746969223,
+      "learning_rate": 1e-06,
+      "loss": 0.5223,
+      "step": 245
+    },
+    {
+      "epoch": 0.9791044776119403,
+      "grad_norm": 0.2750173509120941,
+      "learning_rate": 1e-06,
+      "loss": 0.5051,
+      "step": 246
+    },
+    {
+      "epoch": 0.9830845771144279,
+      "grad_norm": 0.2807095944881439,
+      "learning_rate": 1e-06,
+      "loss": 0.503,
+      "step": 247
+    },
+    {
+      "epoch": 0.9870646766169154,
+      "grad_norm": 0.2665058970451355,
+      "learning_rate": 1e-06,
+      "loss": 0.4514,
+      "step": 248
+    },
+    {
+      "epoch": 0.991044776119403,
+      "grad_norm": 0.26747071743011475,
+      "learning_rate": 1e-06,
+      "loss": 0.4601,
+      "step": 249
+    },
+    {
+      "epoch": 0.9950248756218906,
+      "grad_norm": 0.2884337306022644,
+      "learning_rate": 1e-06,
+      "loss": 0.4899,
+      "step": 250
+    },
+    {
+      "epoch": 0.9990049751243781,
+      "grad_norm": 0.29180482029914856,
+      "learning_rate": 1e-06,
+      "loss": 0.4998,
+      "step": 251
+    },
+    {
+      "epoch": 0.9990049751243781,
+      "step": 251,
+      "total_flos": 5.849427398046515e+16,
+      "train_loss": 0.6000438257755036,
+      "train_runtime": 5544.176,
+      "train_samples_per_second": 2.175,
+      "train_steps_per_second": 0.045
+    }
+  ],
+  "logging_steps": 1,
+  "max_steps": 251,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 10,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.849427398046515e+16,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}