adamhao123 committed
Commit 8936f39 · verified · 1 Parent(s): b990bc1

Model save
README.md CHANGED
@@ -1,11 +1,9 @@
  ---
  base_model: deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
- datasets: jdy_analysis
  library_name: transformers
  model_name: Qwen2.5-32B-Open-R1-Distill-jdy-ft
  tags:
  - generated_from_trainer
- - open-r1
  - trl
  - sft
  licence: license
@@ -13,7 +11,7 @@ licence: license

  # Model Card for Qwen2.5-32B-Open-R1-Distill-jdy-ft

- This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B) on the [jdy_analysis](https://huggingface.co/datasets/jdy_analysis) dataset.
+ This model is a fine-tuned version of [deepseek-ai/DeepSeek-R1-Distill-Qwen-32B](https://huggingface.co/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B).
  It has been trained using [TRL](https://github.com/huggingface/trl).

  ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])

  ## Training procedure

- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/songhao9021-9uest/huggingface/runs/p45jgv2a)
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/songhao9021-9uest/huggingface/runs/0kc006oz)


  This model was trained with SFT.
@@ -39,7 +37,7 @@ This model was trained with SFT.
  - TRL: 0.16.0.dev0
  - Transformers: 4.49.0
  - Pytorch: 2.5.1
- - Datasets: 3.3.2
+ - Datasets: 3.4.1
  - Tokenizers: 0.21.1

  ## Citations
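The card's "Quick start" section, unchanged in this commit, loads the checkpoint through a text-generation pipeline. A minimal sketch of that usage, assuming the checkpoint is published as `adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft` (the repo id and the prompt are placeholders, not confirmed by this commit):

```python
import torch
from transformers import pipeline

# Assumed repo id; substitute the actual path of this checkpoint.
generator = pipeline(
    "text-generation",
    model="adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

question = "If you had a time machine, would you visit the past or the future?"
output = generator(
    [{"role": "user", "content": question}],  # chat-format input
    max_new_tokens=256,
    return_full_text=False,
)[0]
print(output["generated_text"])
```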
all_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
  "train_samples": 114,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
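The new throughput figures follow from the run length: 114 samples over a 1196.4 s run is roughly 0.095 samples per second, and the 29 optimizer steps recorded in trainer_state.json give roughly 0.024 steps per second. A small sanity-check sketch, assuming the repo is checked out locally:

```python
import json

with open("all_results.json") as f:
    results = json.load(f)
with open("trainer_state.json") as f:
    state = json.load(f)

# 114 samples / 1196.3929 s ≈ 0.095 samples per second, as reported.
print(results["train_samples"] / results["train_runtime"])
# 29 steps / 1196.3929 s ≈ 0.024 steps per second, as reported.
print(state["global_step"] / results["train_runtime"])
```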
config.json CHANGED
@@ -23,7 +23,7 @@
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.49.0",
- "use_cache": true,
+ "use_cache": false,
  "use_sliding_window": false,
  "vocab_size": 152064
  }
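`use_cache: false` is what the Trainer typically writes out when KV caching was disabled during fine-tuning (for example alongside gradient checkpointing); it is not what you want at inference time. A sketch of turning it back on after loading, assuming the same hypothetical repo id as above:

```python
import torch
from transformers import AutoModelForCausalLM

# Assumed repo id for this checkpoint; adjust to the actual path.
model = AutoModelForCausalLM.from_pretrained(
    "adamhao123/Qwen2.5-32B-Open-R1-Distill-jdy-ft",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# The saved config has use_cache=false; re-enable KV caching for generation.
model.config.use_cache = True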
model-00001-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a9059ef10fa47f6e646117d9e0fa6d5a3d7d7290d9d9d7e364bf5cd3a96958df
+ oid sha256:30ea6fd81c1f88660745f93421cffd7c6785cd6c5f47fa55894dd3435a6b193a
  size 4891730992
model-00002-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:2c8ee020ef4ea66dcce6d93396885ac25df82e7749fc203f08d71fbd39e964df
+ oid sha256:f4735781f9bd6d2d8a3948981b86c8c29fe0481702083fca101f786e9aa76580
  size 4876059352
model-00003-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:52041eeb8f35d6a5c438ea2409cb484140125eae36b6e2cb96d437e9672b4923
+ oid sha256:ca17dd8c94a958cb777f9ed5589d26f233e22453c64175ce4c025053b44f7dd5
  size 4876059384
model-00004-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:10d0cc3074579747ac679e13dac922d5aeafc797f0097e8b3b447ffe63493fb5
+ oid sha256:240ac171f8933770890036a10dac990dbd16e94ce578e3114e1d3eb3f3a55fa6
  size 4876059416
model-00005-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4ffed755b52730674b65e8acbe7e103b8a63028a28387b1decdc40987ec9fd00
+ oid sha256:0084f3902b8b661d77c6a40225d343c17e7c89e0c07c31a7cea35f3d598c0185
  size 4876059416
model-00006-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46319c08fe3c2ac91cdf0bb7dc3305aa378d234d23a4c4ac95423d6b8bee239d
+ oid sha256:36eb65d0d2707a13130d49600c62fde3fc2571714ff9fab91c196b95c99dd1c3
  size 4876059416
model-00007-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:7e6a659dd9e3ea7a2ea85748bbb7f7d3fb3eaa382a5b331a62b0ccf0d63012f5
+ oid sha256:001a2b0fc931a4390ff9495dc64f1b8cd1ab2969f6d9ba5a49499e9a4313ce58
  size 4876059416
model-00008-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:d5a5c0288d94f24c0119817b347f08cb5d8c0fe03537691e2d4af221888cbcdb
+ oid sha256:cbf1c05bdb70a57582cf930ce7edd7018358b9e8ceaac8c907563a4ff560a815
  size 4876059416
model-00009-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1738e2a2d84ce42c5d251acb86e02686438afdb8ec1dde76db7775039ed975a3
+ oid sha256:2bea6d8bf986db1b7caf020811eb2711baed06a6d7446dd4da23206aeb1e51ea
  size 4876059416
model-00010-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:3a62e7006b2e603080f3a78d2ce00ea9e3041e3ad69a7248673bd5cda146a43b
+ oid sha256:2612f21e4860e9ff8da23505d285d3da64db2fbf28080b1c0e22333a6482b7ac
  size 4876059416
model-00011-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ee821fa52519c75873198473bc0fadd6c7a7e128e8c9630cf77d0a4920c7c2b8
+ oid sha256:e7c341b6c5aea237353690fd93289404f21f82e46622bdf50d183043cca1e2c9
  size 4876059416
model-00012-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:ed2c6fb963768d0c0765b872624c79b9297111f860f566e149d61f663fd9e249
+ oid sha256:82a086a48be333c5275f10f0d69422c95864a5e220778fc61711637c1f9eb5e6
  size 4876059416
model-00013-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:46db25eae00cae2295c5afee70396ee5e92b4563e1cf03af8f5bf2e1de6a1d49
+ oid sha256:9e03c63e8ddf7933571d87d012d2cd05a10969f649efa5bcbe031533ca1983fa
  size 4876059416
model-00014-of-00014.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:32a580d78e40b56ca00678a2af9fbf222e6c088f9538a69a361bf31915653c70
+ oid sha256:2820cb0c9a9ba0d59f27aef52e334a4f579f51a6fbeebee077fcb06ac9801df9
  size 2123397800
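These entries are Git LFS pointer files, so the diff only shows the sha256 digest and byte size of each new weight shard. A sketch of verifying a downloaded shard against its pointer (the expected digest below is copied from shard 1 of 14; the local path is an assumption):

```python
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    """Stream-hash a large file so the ~4.9 GB shards never sit fully in memory."""
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

# Digest taken from the updated LFS pointer for model-00001-of-00014.safetensors.
expected = "30ea6fd81c1f88660745f93421cffd7c6785cd6c5f47fa55894dd3435a6b193a"
shard = Path("model-00001-of-00014.safetensors")  # local path after download
assert sha256_of(shard) == expected, "shard does not match its LFS pointer"
```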
tokenizer_config.json CHANGED
@@ -181,7 +181,7 @@
  }
  },
  "bos_token": "<|begin▁of▁sentence|>",
- "chat_template": "\n{% for message in messages %}\n {% if message['role'] == 'system' %}\n<|im_start|>system\n{{ message['content'] }}<|im_end|>\n {% else %}\n<|im_start|>{{ message['role'] }}\n{{ message['content'] }}<|im_end|>\n {% endif %}\n{% endfor %}\n",
+ "chat_template": "\n{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='') %}{%- for message in messages %}{%- if message['role'] == 'system' %}{% set ns.system_prompt = message['content'] %}{%- endif %}{%- endfor %}{{bos_token}}{{ns.system_prompt}}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is none %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls']%}{%- if not ns.is_first %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- endfor %}{%- endif %}{%- if message['role'] == 'assistant' and message['content'] is not none %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'\\n<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}\n",
  "clean_up_tokenization_spaces": false,
  "eos_token": "<|end▁of▁sentence|>",
  "extra_special_tokens": {},
train_results.json CHANGED
@@ -1,8 +1,8 @@
  {
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
  "train_samples": 114,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
trainer_state.json CHANGED
@@ -1,107 +1,259 @@
  {
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 3.3333333333333335,
  "eval_steps": 500,
- "global_step": 50,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
- "epoch": 0.3333333333333333,
- "grad_norm": 2.3431689739227295,
  "learning_rate": 2e-05,
- "loss": 1.4244,
- "mean_token_accuracy": 0.6533483505249024,
  "step": 5
  },
  {
- "epoch": 0.6666666666666666,
- "grad_norm": 1.3162345886230469,
- "learning_rate": 1.9396926207859085e-05,
- "loss": 0.8059,
- "mean_token_accuracy": 0.7710338115692139,
  "step": 10
  },
  {
- "epoch": 1.0,
- "grad_norm": 1.3825139999389648,
- "learning_rate": 1.766044443118978e-05,
- "loss": 0.7155,
- "mean_token_accuracy": 0.7817506909370422,
  "step": 15
  },
  {
- "epoch": 1.3333333333333333,
- "grad_norm": 1.2070649862289429,
- "learning_rate": 1.5000000000000002e-05,
- "loss": 0.515,
- "mean_token_accuracy": 0.8140048146247864,
  "step": 20
  },
  {
- "epoch": 1.6666666666666665,
- "grad_norm": 1.0796136856079102,
- "learning_rate": 1.1736481776669307e-05,
- "loss": 0.508,
- "mean_token_accuracy": 0.8207547307014466,
- "step": 25
  },
  {
- "epoch": 2.0,
- "grad_norm": 1.3636045455932617,
- "learning_rate": 8.263518223330698e-06,
- "loss": 0.4524,
- "mean_token_accuracy": 0.8510397672653198,
- "step": 30
  },
  {
- "epoch": 2.3333333333333335,
- "grad_norm": 0.7634139060974121,
- "learning_rate": 5.000000000000003e-06,
- "loss": 0.3877,
- "mean_token_accuracy": 0.8616405725479126,
- "step": 35
  },
  {
- "epoch": 2.6666666666666665,
- "grad_norm": 0.9971669316291809,
- "learning_rate": 2.339555568810221e-06,
- "loss": 0.359,
- "mean_token_accuracy": 0.8703584909439087,
- "step": 40
  },
  {
- "epoch": 3.0,
- "grad_norm": 0.9843636751174927,
- "learning_rate": 6.030737921409169e-07,
- "loss": 0.4023,
- "mean_token_accuracy": 0.8350511193275452,
- "step": 45
  },
  {
- "epoch": 3.3333333333333335,
- "grad_norm": 0.7454453706741333,
  "learning_rate": 0.0,
- "loss": 0.3313,
- "mean_token_accuracy": 0.8710624217987061,
- "step": 50
- },
- {
- "epoch": 3.3333333333333335,
- "step": 50,
- "total_flos": 3571493896192.0,
- "train_loss": 0.5901590967178345,
- "train_runtime": 1292.5059,
- "train_samples_per_second": 0.309,
- "train_steps_per_second": 0.039
  }
  ],
- "logging_steps": 5,
- "max_steps": 50,
  "num_input_tokens_seen": 0,
- "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {
@@ -115,7 +267,7 @@
  "attributes": {}
  }
  },
- "total_flos": 3571493896192.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
 
  {
  "best_metric": null,
  "best_model_checkpoint": null,
+ "epoch": 1.0,
  "eval_steps": 500,
+ "global_step": 29,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
  {
+ "epoch": 0.034482758620689655,
+ "grad_norm": 4.495543003082275,
+ "learning_rate": 6.666666666666667e-06,
+ "loss": 1.6484,
+ "mean_token_accuracy": 0.6393418908119202,
+ "step": 1
+ },
+ {
+ "epoch": 0.06896551724137931,
+ "grad_norm": 5.30371618270874,
+ "learning_rate": 1.3333333333333333e-05,
+ "loss": 1.7034,
+ "mean_token_accuracy": 0.6935998201370239,
+ "step": 2
+ },
+ {
+ "epoch": 0.10344827586206896,
+ "grad_norm": 4.937737941741943,
  "learning_rate": 2e-05,
+ "loss": 1.6917,
+ "mean_token_accuracy": 0.6422624588012695,
+ "step": 3
+ },
+ {
+ "epoch": 0.13793103448275862,
+ "grad_norm": 3.243507146835327,
+ "learning_rate": 1.992708874098054e-05,
+ "loss": 1.2509,
+ "mean_token_accuracy": 0.7191780805587769,
+ "step": 4
+ },
+ {
+ "epoch": 0.1724137931034483,
+ "grad_norm": 2.8890597820281982,
+ "learning_rate": 1.9709418174260523e-05,
+ "loss": 1.0261,
+ "mean_token_accuracy": 0.7499054074287415,
  "step": 5
  },
  {
+ "epoch": 0.20689655172413793,
+ "grad_norm": 2.3275909423828125,
+ "learning_rate": 1.9350162426854152e-05,
+ "loss": 1.0115,
+ "mean_token_accuracy": 0.7381644248962402,
+ "step": 6
+ },
+ {
+ "epoch": 0.2413793103448276,
+ "grad_norm": 2.2428598403930664,
+ "learning_rate": 1.8854560256532098e-05,
+ "loss": 1.0019,
+ "mean_token_accuracy": 0.7424749135971069,
+ "step": 7
+ },
+ {
+ "epoch": 0.27586206896551724,
+ "grad_norm": 2.0306403636932373,
+ "learning_rate": 1.8229838658936566e-05,
+ "loss": 0.8716,
+ "mean_token_accuracy": 0.7403547167778015,
+ "step": 8
+ },
+ {
+ "epoch": 0.3103448275862069,
+ "grad_norm": 2.3220930099487305,
+ "learning_rate": 1.7485107481711014e-05,
+ "loss": 0.9914,
+ "mean_token_accuracy": 0.7194558382034302,
+ "step": 9
+ },
+ {
+ "epoch": 0.3448275862068966,
+ "grad_norm": 1.9385402202606201,
+ "learning_rate": 1.6631226582407954e-05,
+ "loss": 0.8769,
+ "mean_token_accuracy": 0.7493368983268738,
  "step": 10
  },
  {
+ "epoch": 0.3793103448275862,
+ "grad_norm": 1.6588478088378906,
+ "learning_rate": 1.568064746731156e-05,
+ "loss": 0.7935,
+ "mean_token_accuracy": 0.7707903981208801,
+ "step": 11
+ },
+ {
+ "epoch": 0.41379310344827586,
+ "grad_norm": 1.5337258577346802,
+ "learning_rate": 1.4647231720437687e-05,
+ "loss": 0.5609,
+ "mean_token_accuracy": 0.8345285058021545,
+ "step": 12
+ },
+ {
+ "epoch": 0.4482758620689655,
+ "grad_norm": 1.4156630039215088,
+ "learning_rate": 1.3546048870425356e-05,
+ "loss": 0.7261,
+ "mean_token_accuracy": 0.7816407084465027,
+ "step": 13
+ },
+ {
+ "epoch": 0.4827586206896552,
+ "grad_norm": 2.230161666870117,
+ "learning_rate": 1.2393156642875579e-05,
+ "loss": 0.8858,
+ "mean_token_accuracy": 0.7566941380500793,
+ "step": 14
+ },
+ {
+ "epoch": 0.5172413793103449,
+ "grad_norm": 1.8028125762939453,
+ "learning_rate": 1.1205366802553231e-05,
+ "loss": 1.1158,
+ "mean_token_accuracy": 0.5864984393119812,
  "step": 15
  },
  {
+ "epoch": 0.5517241379310345,
+ "grad_norm": 1.6085702180862427,
+ "learning_rate": 1e-05,
+ "loss": 0.8757,
+ "mean_token_accuracy": 0.6398804783821106,
+ "step": 16
+ },
+ {
+ "epoch": 0.5862068965517241,
+ "grad_norm": 1.536893367767334,
+ "learning_rate": 8.79463319744677e-06,
+ "loss": 0.5636,
+ "mean_token_accuracy": 0.8361495137214661,
+ "step": 17
+ },
+ {
+ "epoch": 0.6206896551724138,
+ "grad_norm": 1.5602803230285645,
+ "learning_rate": 7.606843357124426e-06,
+ "loss": 0.5936,
+ "mean_token_accuracy": 0.8189575672149658,
+ "step": 18
+ },
+ {
+ "epoch": 0.6551724137931034,
+ "grad_norm": 2.647397518157959,
+ "learning_rate": 6.453951129574644e-06,
+ "loss": 0.9382,
+ "mean_token_accuracy": 0.7290132641792297,
+ "step": 19
+ },
+ {
+ "epoch": 0.6896551724137931,
+ "grad_norm": 2.4632463455200195,
+ "learning_rate": 5.352768279562315e-06,
+ "loss": 0.9069,
+ "mean_token_accuracy": 0.761885404586792,
  "step": 20
  },
  {
+ "epoch": 0.7241379310344828,
+ "grad_norm": 3.2052621841430664,
+ "learning_rate": 4.319352532688444e-06,
+ "loss": 0.7117,
+ "mean_token_accuracy": 0.7920604944229126,
+ "step": 21
  },
  {
+ "epoch": 0.7586206896551724,
+ "grad_norm": 3.087772846221924,
+ "learning_rate": 3.3687734175920505e-06,
+ "loss": 0.6292,
+ "mean_token_accuracy": 0.8239715099334717,
+ "step": 22
  },
  {
+ "epoch": 0.7931034482758621,
+ "grad_norm": 1.6388134956359863,
+ "learning_rate": 2.514892518288988e-06,
+ "loss": 0.9746,
+ "mean_token_accuracy": 0.6847044825553894,
+ "step": 23
  },
  {
+ "epoch": 0.8275862068965517,
+ "grad_norm": 1.2204201221466064,
+ "learning_rate": 1.7701613410634367e-06,
+ "loss": 0.6487,
+ "mean_token_accuracy": 0.7619174718856812,
+ "step": 24
  },
  {
+ "epoch": 0.8620689655172413,
+ "grad_norm": 1.7940735816955566,
+ "learning_rate": 1.1454397434679022e-06,
+ "loss": 0.9551,
+ "mean_token_accuracy": 0.6685531735420227,
+ "step": 25
+ },
+ {
+ "epoch": 0.896551724137931,
+ "grad_norm": 1.6773293018341064,
+ "learning_rate": 6.498375731458529e-07,
+ "loss": 0.6681,
+ "mean_token_accuracy": 0.8169485330581665,
+ "step": 26
  },
  {
+ "epoch": 0.9310344827586207,
+ "grad_norm": 1.4842838048934937,
+ "learning_rate": 2.905818257394799e-07,
+ "loss": 0.5996,
+ "mean_token_accuracy": 0.8048691749572754,
+ "step": 27
+ },
+ {
+ "epoch": 0.9655172413793104,
+ "grad_norm": 1.9699349403381348,
+ "learning_rate": 7.291125901946027e-08,
+ "loss": 0.9032,
+ "mean_token_accuracy": 0.7233096361160278,
+ "step": 28
+ },
+ {
+ "epoch": 1.0,
+ "grad_norm": 1.4653249979019165,
  "learning_rate": 0.0,
+ "loss": 0.6289,
+ "mean_token_accuracy": 0.7883654236793518,
+ "step": 29
+ },
+ {
+ "epoch": 1.0,
+ "step": 29,
+ "total_flos": 997309579264.0,
+ "train_loss": 0.9225217268384737,
+ "train_runtime": 1196.3929,
+ "train_samples_per_second": 0.095,
+ "train_steps_per_second": 0.024
  }
  ],
+ "logging_steps": 1,
+ "max_steps": 29,
  "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
  "TrainerControl": {

  "attributes": {}
  }
  },
+ "total_flos": 997309579264.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:be5a4ae8ae971abeee85c945e74a2cb9f97a38b54a3dc0b3862017acc587b1d5
+ oid sha256:316bd20663017623357bdc27e97c2b810728c9afe43979e3bd3390b148541b97
  size 7800
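training_args.bin is the pickled training-arguments object the Trainer saves next to the weights. A sketch of inspecting it, assuming torch and the trl/transformers versions from the card are installed so the pickled class can be resolved; only unpickle files from sources you trust:

```python
import torch

# Full unpickling is required because this is an arbitrary pickled object,
# not a plain tensor file.
args = torch.load("training_args.bin", weights_only=False)
print(args.learning_rate, args.num_train_epochs, args.per_device_train_batch_size)
```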