adamhao123 committed
Commit 8936f39 · verified · 1 Parent(s): b990bc1

Model save
README.md CHANGED
@@ -1,11 +1,9 @@
  ---
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
- datasets: jdy_analysis
  library_name: transformers
  model_name: Qwen2.5-32B-Open-R1-Distill-jdy-ft
  tags:
  - generated_from_trainer
- - open-r1
  - trl
  - sft
  licence: license
@@ -13,7 +11,7 @@ licence: license

  # Model Card for Qwen2.5-32B-Open-R1-Distill-jdy-ft

- This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on the [jdy_analysis](https://huggingface.co/datasets/jdy_analysis) dataset.
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B).
  It has been trained using [TRL](https://github.com/huggingface/trl).

  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])

  ## Training procedure

- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/songhao9021-9uest/huggingface/runs/p45jgv2a)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/songhao9021-9uest/huggingface/runs/0kc006oz)


  This model was trained with SFT.
@@ -39,7 +37,7 @@ This model was trained with SFT.
  - TRL: 0.16.0.dev0
  - Transformers: 4.49.0
  - Pytorch: 2.5.1
- - Datasets: 3.3.2
+ - Datasets: 3.4.1
  - Tokenizers: 0.21.1

  ## Citations
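The card's "Quick start" section, unchanged in this commit, loads the checkpoint through a text-generation pipeline. A minimal sketch of that usage, assuming the checkpoint is published as `adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft` (the repo id and the prompt are placeholders, not confirmed by this commit):

```python
import torch
from transformers import pipeline

# Assumed repo id; substitute the actual path of this checkpoint.
generator = pipeline(
    "text-generation",
    model="adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

question = "If you had a time machine, would you visit the past or the future?"
output = generator(
    [{"role": "user", "content": question}],  # chat-format input
    max_new_tokens=256,
    return_full_text=False,
)[0]
print(output["generated_text"])
```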
all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
  "train_samples": 114,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
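The new throughput figures follow from the run length: 114 samples over a 1196.4 s run is roughly 0.095 samples per second, and the 29 optimizer steps recorded in trainer_state.json give roughly 0.024 steps per second. A small sanity-check sketch, assuming the repo is checked out locally:

```python
import json

with open("all_results.json") as f:
    results = json.load(f)
with open("trainer_state.json") as f:
    state = json.load(f)

# 114 samples / 1196.3929 s ≈ 0.095 samples per second, as reported.
print(results["train_samples"] / results["train_runtime"])
# 29 steps / 1196.3929 s ≈ 0.024 steps per second, as reported.
print(state["global_step"] / results["train_runtime"])
```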
config.json CHANGED
@@ -23,7 +23,7 @@
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
- "use_cache": true,
+ "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 152064
  }
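`use_cache: false` is what the Trainer typically writes out when KV caching was disabled during fine-tuning (for example alongside gradient checkpointing); it is not what you want at inference time. A sketch of turning it back on after loading, assuming the same hypothetical repo id as above:

```python
import torch
from transformers import AutoModelForCausalLM

# Assumed repo id for this checkpoint; adjust to the actual path.
model = AutoModelForCausalLM.from_pretrained(
    "adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The saved config has use_cache=false; re-enable KV caching for generation.
model.config.use_cache = True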
model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a9059ef10fa47f6e646117d9e0fa6d5a3d7d7290d9d9d7e364bf5cd3a96958df
+ oid sha256:30ea6fd81c1f88660745f93421cffd7c6785cd6c5f47fa55894dd3435a6b193a
  size 4891730992
model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2c8ee020ef4ea66dcce6d93396885ac25df82e7749fc203f08d71fbd39e964df
+ oid sha256:f4735781f9bd6d2d8a3948981b86c8c29fe0481702083fca101f786e9aa76580
  size 4876059352
model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:52041eeb8f35d6a5c438ea2409cb484140125eae36b6e2cb96d437e9672b4923
+ oid sha256:ca17dd8c94a958cb777f9ed5589d26f233e22453c64175ce4c025053b44f7dd5
  size 4876059384
model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:10d0cc3074579747ac679e13dac922d5aeafc797f0097e8b3b447ffe63493fb5
+ oid sha256:240ac171f8933770890036a10dac990dbd16e94ce578e3114e1d3eb3f3a55fa6
  size 4876059416
model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4ffed755b52730674b65e8acbe7e103b8a63028a28387b1decdc40987ec9fd00
+ oid sha256:0084f3902b8b661d77c6a40225d343c17e7c89e0c07c31a7cea35f3d598c0185
  size 4876059416
model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46319c08fe3c2ac91cdf0bb7dc3305aa378d234d23a4c4ac95423d6b8bee239d
+ oid sha256:36eb65d0d2707a13130d49600c62fde3fc2571714ff9fab91c196b95c99dd1c3
  size 4876059416
model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7e6a659dd9e3ea7a2ea85748bbb7f7d3fb3eaa382a5b331a62b0ccf0d63012f5
+ oid sha256:001a2b0fc931a4390ff9495dc64f1b8cd1ab2969f6d9ba5a49499e9a4313ce58
  size 4876059416
model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d5a5c0288d94f24c0119817b347f08cb5d8c0fe03537691e2d4af221888cbcdb
+ oid sha256:cbf1c05bdb70a57582cf930ce7edd7018358b9e8ceaac8c907563a4ff560a815
  size 4876059416
model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1738e2a2d84ce42c5d251acb86e02686438afdb8ec1dde76db7775039ed975a3
+ oid sha256:2bea6d8bf986db1b7caf020811eb2711baed06a6d7446dd4da23206aeb1e51ea
  size 4876059416
model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3a62e7006b2e603080f3a78d2ce00ea9e3041e3ad69a7248673bd5cda146a43b
+ oid sha256:2612f21e4860e9ff8da23505d285d3da64db2fbf28080b1c0e22333a6482b7ac
  size 4876059416
model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ee821fa52519c75873198473bc0fadd6c7a7e128e8c9630cf77d0a4920c7c2b8
+ oid sha256:e7c341b6c5aea237353690fd93289404f21f82e46622bdf50d183043cca1e2c9
  size 4876059416
model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ed2c6fb963768d0c0765b872624c79b9297111f860f566e149d61f663fd9e249
+ oid sha256:82a086a48be333c5275f10f0d69422c95864a5e220778fc61711637c1f9eb5e6
  size 4876059416
model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46db25eae00cae2295c5afee70396ee5e92b4563e1cf03af8f5bf2e1de6a1d49
+ oid sha256:9e03c63e8ddf7933571d87d012d2cd05a10969f649efa5bcbe031533ca1983fa
  size 4876059416
model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:32a580d78e40b56ca00678a2af9fbf222e6c088f9538a69a361bf31915653c70
+ oid sha256:2820cb0c9a9ba0d59f27aef52e334a4f579f51a6fbeebee077fcb06ac9801df9
  size 2123397800
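These entries are Git LFS pointer files, so the diff only shows the sha256 digest and byte size of each new weight shard. A sketch of verifying a downloaded shard against its pointer (the expected digest below is copied from shard 1 of 14; the local path is an assumption):

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream-hash a large file so the ~4.9 GB shards never sit fully in memory."""
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Digest taken from the updated LFS pointer for model-00001-of-00014.safetensors.
expected = "30ea6fd81c1f88660745f93421cffd7c6785cd6c5f47fa55894dd3435a6b193a"
shard = Path("model-00001-of-00014.safetensors")  # local path after download
assert sha256_of(shard) == expected, "shard does not match its LFS pointer"
```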
tokenizer_config.json CHANGED
@@ -181,7 +181,7 @@
  }
  },
  "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "\n{% for message in messages %}\n {% if message['role'] == 'system' %}\n<|im_start|>system\n{{ message['content'] }}<|im_end|>\n {% else %}\n<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n {% endif %}\n{% endfor %}\n",
+ "chat_template": "\n{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}\n",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|end▁of▁sentence|>",
  "extra_special_tokens": {},
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
  "train_samples": 114,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
trainer_state.json CHANGED
@@ -1,107 +1,259 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 3.3333333333333335,
  "eval_steps": 500,
- "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.3333333333333333,
- "grad_norm": 2.3431689739227295,
  "learning_rate": 2e-05,
- "loss": 1.4244,
- "mean_token_accuracy": 0.6533483505249024,
  "step": 5
  },
  {
- "epoch": 0.6666666666666666,
- "grad_norm": 1.3162345886230469,
- "learning_rate": 1.9396926207859085e-05,
- "loss": 0.8059,
- "mean_token_accuracy": 0.7710338115692139,
  "step": 10
  },
  {
- "epoch": 1.0,
- "grad_norm": 1.3825139999389648,
- "learning_rate": 1.766044443118978e-05,
- "loss": 0.7155,
- "mean_token_accuracy": 0.7817506909370422,
  "step": 15
  },
  {
- "epoch": 1.3333333333333333,
- "grad_norm": 1.2070649862289429,
- "learning_rate": 1.5000000000000002e-05,
- "loss": 0.515,
- "mean_token_accuracy": 0.8140048146247864,
  "step": 20
  },
  {
- "epoch": 1.6666666666666665,
- "grad_norm": 1.0796136856079102,
- "learning_rate": 1.1736481776669307e-05,
- "loss": 0.508,
- "mean_token_accuracy": 0.8207547307014466,
- "step": 25
  },
  {
- "epoch": 2.0,
- "grad_norm": 1.3636045455932617,
- "learning_rate": 8.263518223330698e-06,
- "loss": 0.4524,
- "mean_token_accuracy": 0.8510397672653198,
- "step": 30
  },
  {
- "epoch": 2.3333333333333335,
- "grad_norm": 0.7634139060974121,
- "learning_rate": 5.000000000000003e-06,
- "loss": 0.3877,
- "mean_token_accuracy": 0.8616405725479126,
- "step": 35
  },
  {
- "epoch": 2.6666666666666665,
- "grad_norm": 0.9971669316291809,
- "learning_rate": 2.339555568810221e-06,
- "loss": 0.359,
- "mean_token_accuracy": 0.8703584909439087,
- "step": 40
  },
  {
- "epoch": 3.0,
- "grad_norm": 0.9843636751174927,
- "learning_rate": 6.030737921409169e-07,
- "loss": 0.4023,
- "mean_token_accuracy": 0.8350511193275452,
- "step": 45
  },
  {
- "epoch": 3.3333333333333335,
- "grad_norm": 0.7454453706741333,
  "learning_rate": 0.0,
- "loss": 0.3313,
- "mean_token_accuracy": 0.8710624217987061,
- "step": 50
- },
- {
- "epoch": 3.3333333333333335,
- "step": 50,
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
  }
  ],
- "logging_steps": 5,
- "max_steps": 50,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {
@@ -115,7 +267,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3571493896192.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 1.0,
  "eval_steps": 500,
+ "global_step": 29,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.034482758620689655,
+ "grad_norm": 4.495543003082275,
+ "learning_rate": 6.666666666666667e-06,
+ "loss": 1.6484,
+ "mean_token_accuracy": 0.6393418908119202,
+ "step": 1
+ },
+ {
+ "epoch": 0.06896551724137931,
+ "grad_norm": 5.30371618270874,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 1.7034,
+ "mean_token_accuracy": 0.6935998201370239,
+ "step": 2
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 4.937737941741943,
  "learning_rate": 2e-05,
+ "loss": 1.6917,
+ "mean_token_accuracy": 0.6422624588012695,
+ "step": 3
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 3.243507146835327,
+ "learning_rate": 1.992708874098054e-05,
+ "loss": 1.2509,
+ "mean_token_accuracy": 0.7191780805587769,
+ "step": 4
+ },
+ {
+ "epoch": 0.1724137931034483,
+ "grad_norm": 2.8890597820281982,
+ "learning_rate": 1.9709418174260523e-05,
+ "loss": 1.0261,
+ "mean_token_accuracy": 0.7499054074287415,
  "step": 5
  },
  {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 2.3275909423828125,
+ "learning_rate": 1.9350162426854152e-05,
+ "loss": 1.0115,
+ "mean_token_accuracy": 0.7381644248962402,
+ "step": 6
+ },
+ {
+ "epoch": 0.2413793103448276,
+ "grad_norm": 2.2428598403930664,
+ "learning_rate": 1.8854560256532098e-05,
+ "loss": 1.0019,
+ "mean_token_accuracy": 0.7424749135971069,
+ "step": 7
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.0306403636932373,
+ "learning_rate": 1.8229838658936566e-05,
+ "loss": 0.8716,
+ "mean_token_accuracy": 0.7403547167778015,
+ "step": 8
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 2.3220930099487305,
+ "learning_rate": 1.7485107481711014e-05,
+ "loss": 0.9914,
+ "mean_token_accuracy": 0.7194558382034302,
+ "step": 9
+ },
+ {
+ "epoch": 0.3448275862068966,
+ "grad_norm": 1.9385402202606201,
+ "learning_rate": 1.6631226582407954e-05,
+ "loss": 0.8769,
+ "mean_token_accuracy": 0.7493368983268738,
  "step": 10
  },
  {
+ "epoch": 0.3793103448275862,
+ "grad_norm": 1.6588478088378906,
+ "learning_rate": 1.568064746731156e-05,
+ "loss": 0.7935,
+ "mean_token_accuracy": 0.7707903981208801,
+ "step": 11
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 1.5337258577346802,
+ "learning_rate": 1.4647231720437687e-05,
+ "loss": 0.5609,
+ "mean_token_accuracy": 0.8345285058021545,
+ "step": 12
+ },
+ {
+ "epoch": 0.4482758620689655,
+ "grad_norm": 1.4156630039215088,
+ "learning_rate": 1.3546048870425356e-05,
+ "loss": 0.7261,
+ "mean_token_accuracy": 0.7816407084465027,
+ "step": 13
+ },
+ {
+ "epoch": 0.4827586206896552,
+ "grad_norm": 2.230161666870117,
+ "learning_rate": 1.2393156642875579e-05,
+ "loss": 0.8858,
+ "mean_token_accuracy": 0.7566941380500793,
+ "step": 14
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 1.8028125762939453,
+ "learning_rate": 1.1205366802553231e-05,
+ "loss": 1.1158,
+ "mean_token_accuracy": 0.5864984393119812,
  "step": 15
  },
  {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 1.6085702180862427,
+ "learning_rate": 1e-05,
+ "loss": 0.8757,
+ "mean_token_accuracy": 0.6398804783821106,
+ "step": 16
+ },
+ {
+ "epoch": 0.5862068965517241,
+ "grad_norm": 1.536893367767334,
+ "learning_rate": 8.79463319744677e-06,
+ "loss": 0.5636,
+ "mean_token_accuracy": 0.8361495137214661,
+ "step": 17
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 1.5602803230285645,
+ "learning_rate": 7.606843357124426e-06,
+ "loss": 0.5936,
+ "mean_token_accuracy": 0.8189575672149658,
+ "step": 18
+ },
+ {
+ "epoch": 0.6551724137931034,
+ "grad_norm": 2.647397518157959,
+ "learning_rate": 6.453951129574644e-06,
+ "loss": 0.9382,
+ "mean_token_accuracy": 0.7290132641792297,
+ "step": 19
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.4632463455200195,
+ "learning_rate": 5.352768279562315e-06,
+ "loss": 0.9069,
+ "mean_token_accuracy": 0.761885404586792,
  "step": 20
  },
  {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 3.2052621841430664,
+ "learning_rate": 4.319352532688444e-06,
+ "loss": 0.7117,
+ "mean_token_accuracy": 0.7920604944229126,
+ "step": 21
  },
  {
+ "epoch": 0.7586206896551724,
+ "grad_norm": 3.087772846221924,
+ "learning_rate": 3.3687734175920505e-06,
+ "loss": 0.6292,
+ "mean_token_accuracy": 0.8239715099334717,
+ "step": 22
  },
  {
+ "epoch": 0.7931034482758621,
+ "grad_norm": 1.6388134956359863,
+ "learning_rate": 2.514892518288988e-06,
+ "loss": 0.9746,
+ "mean_token_accuracy": 0.6847044825553894,
+ "step": 23
  },
  {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.2204201221466064,
+ "learning_rate": 1.7701613410634367e-06,
+ "loss": 0.6487,
+ "mean_token_accuracy": 0.7619174718856812,
+ "step": 24
  },
  {
+ "epoch": 0.8620689655172413,
+ "grad_norm": 1.7940735816955566,
+ "learning_rate": 1.1454397434679022e-06,
+ "loss": 0.9551,
+ "mean_token_accuracy": 0.6685531735420227,
+ "step": 25
+ },
+ {
+ "epoch": 0.896551724137931,
+ "grad_norm": 1.6773293018341064,
+ "learning_rate": 6.498375731458529e-07,
+ "loss": 0.6681,
+ "mean_token_accuracy": 0.8169485330581665,
+ "step": 26
  },
  {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 1.4842838048934937,
+ "learning_rate": 2.905818257394799e-07,
+ "loss": 0.5996,
+ "mean_token_accuracy": 0.8048691749572754,
+ "step": 27
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.9699349403381348,
+ "learning_rate": 7.291125901946027e-08,
+ "loss": 0.9032,
+ "mean_token_accuracy": 0.7233096361160278,
+ "step": 28
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.4653249979019165,
  "learning_rate": 0.0,
+ "loss": 0.6289,
+ "mean_token_accuracy": 0.7883654236793518,
+ "step": 29
+ },
+ {
+ "epoch": 1.0,
+ "step": 29,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
  ],
+ "logging_steps": 1,
+ "max_steps": 29,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 997309579264.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:be5a4ae8ae971abeee85c945e74a2cb9f97a38b54a3dc0b3862017acc587b1d5
+ oid sha256:316bd20663017623357bdc27e97c2b810728c9afe43979e3bd3390b148541b97
  size 7800
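training_args.bin is the pickled training-arguments object the Trainer saves next to the weights. A sketch of inspecting it, assuming torch and the trl/transformers versions from the card are installed so the pickled class can be resolved; only unpickle files from sources you trust:

```python
import torch

# Full unpickling is required because this is an arbitrary pickled object,
# not a plain tensor file.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```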