moyixiao committed on
Commit
34c15aa
·
verified ·
1 Parent(s): d1b3a71

Training in progress, step 80, checkpoint

Browse files
checkpoint-80/README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: Qwen/Qwen2.5-1.5B
3
  library_name: peft
4
  ---
5
 
 
1
  ---
2
+ base_model: Qwen/Qwen3-0.6B-Base
3
  library_name: peft
4
  ---
5
 
checkpoint-80/adapter_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
- "base_model_name_or_path": "Qwen/Qwen2.5-1.5B",
5
  "bias": "none",
6
  "eva_config": null,
7
  "exclude_modules": null,
@@ -23,13 +23,13 @@
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
- "k_proj",
27
  "v_proj",
28
  "gate_proj",
29
- "up_proj",
30
  "down_proj",
31
  "o_proj",
32
- "q_proj"
 
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
 
1
  {
2
  "alpha_pattern": {},
3
  "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen3-0.6B-Base",
5
  "bias": "none",
6
  "eva_config": null,
7
  "exclude_modules": null,
 
23
  "rank_pattern": {},
24
  "revision": null,
25
  "target_modules": [
26
+ "q_proj",
27
  "v_proj",
28
  "gate_proj",
 
29
  "down_proj",
30
  "o_proj",
31
+ "k_proj",
32
+ "up_proj"
33
  ],
34
  "task_type": "CAUSAL_LM",
35
  "use_dora": false,
checkpoint-80/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a2171570d9bd7dd751f84ffc063a29d16bb985b6da7b54fd5791ffcf7f631c67
3
- size 147770496
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:704aa895f2f07026040c80aabf199a59a516b3812a5f621b843a6d90d51efef1
3
+ size 80792456
checkpoint-80/added_tokens.json CHANGED
@@ -1,6 +1,10 @@
1
  {
 
2
  "</tool_call>": 151658,
 
 
3
  "<tool_call>": 151657,
 
4
  "<|box_end|>": 151649,
5
  "<|box_start|>": 151648,
6
  "<|endoftext|>": 151643,
 
1
  {
2
+ "</think>": 151668,
3
  "</tool_call>": 151658,
4
+ "</tool_response>": 151666,
5
+ "<think>": 151667,
6
  "<tool_call>": 151657,
7
+ "<tool_response>": 151665,
8
  "<|box_end|>": 151649,
9
  "<|box_start|>": 151648,
10
  "<|endoftext|>": 151643,
checkpoint-80/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6c144d6a2e0021236496a3f677b3883ef3f1949de5af7aba3102df0fa925d563
3
- size 295765866
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16cead893f86861710de22e991845205a6d4b679b7565f4eaec58cde24be7b9e
3
+ size 161810474
checkpoint-80/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:aae4fbbc5ce9ed472f995338a1195c12ba1eb2aa7dc11ee2ebdfebb350e98349
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a29189bdfdfa8f4556c761fea884eb42e375a1a4105f9506a8b8d7fa7ff34f8
3
  size 1064
checkpoint-80/tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
- size 11421896
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aeb13307a71acd8fe81861d94ad54ab689df773318809eed3cbe794b4492dae4
3
+ size 11422654
checkpoint-80/tokenizer_config.json CHANGED
@@ -177,6 +177,38 @@
177
  "rstrip": false,
178
  "single_word": false,
179
  "special": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
180
  }
181
  },
182
  "additional_special_tokens": [
 
177
  "rstrip": false,
178
  "single_word": false,
179
  "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<tool_response>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": false
188
+ },
189
+ "151666": {
190
+ "content": "</tool_response>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": false
196
+ },
197
+ "151667": {
198
+ "content": "<think>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": false
204
+ },
205
+ "151668": {
206
+ "content": "</think>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": false
212
  }
213
  },
214
  "additional_special_tokens": [
checkpoint-80/trainer_state.json CHANGED
@@ -2,7 +2,7 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.09397944199706314,
6
  "eval_steps": 500,
7
  "global_step": 80,
8
  "is_hyper_param_search": false,
@@ -10,67 +10,67 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.011747430249632892,
14
- "grad_norm": 1.5699902772903442,
15
- "learning_rate": 0.00027,
16
- "loss": 3.0983,
17
  "step": 10
18
  },
19
  {
20
- "epoch": 0.023494860499265784,
21
- "grad_norm": 1.6029695272445679,
22
- "learning_rate": 0.00029991523567092526,
23
- "loss": 2.062,
24
  "step": 20
25
  },
26
  {
27
- "epoch": 0.03524229074889868,
28
- "grad_norm": 1.593436360359192,
29
- "learning_rate": 0.00029962234616583063,
30
- "loss": 1.2074,
31
  "step": 30
32
  },
33
  {
34
- "epoch": 0.04698972099853157,
35
- "grad_norm": 0.5851414799690247,
36
- "learning_rate": 0.00029912069357315393,
37
- "loss": 0.888,
38
  "step": 40
39
  },
40
  {
41
- "epoch": 0.05873715124816446,
42
- "grad_norm": 0.25992292165756226,
43
- "learning_rate": 0.0002984109778320875,
44
- "loss": 0.7685,
45
  "step": 50
46
  },
47
  {
48
- "epoch": 0.07048458149779736,
49
- "grad_norm": 0.21082307398319244,
50
- "learning_rate": 0.00029749418918542057,
51
- "loss": 0.7096,
52
  "step": 60
53
  },
54
  {
55
- "epoch": 0.08223201174743025,
56
- "grad_norm": 0.16843102872371674,
57
- "learning_rate": 0.0002963716067978866,
58
- "loss": 0.6901,
59
  "step": 70
60
  },
61
  {
62
- "epoch": 0.09397944199706314,
63
- "grad_norm": 0.12076722830533981,
64
- "learning_rate": 0.000295044796971387,
65
- "loss": 0.6702,
66
  "step": 80
67
  }
68
  ],
69
  "logging_steps": 10,
70
- "max_steps": 851,
71
  "num_input_tokens_seen": 0,
72
- "num_train_epochs": 1,
73
- "save_steps": 40,
74
  "stateful_callbacks": {
75
  "TrainerControl": {
76
  "args": {
@@ -83,8 +83,8 @@
83
  "attributes": {}
84
  }
85
  },
86
- "total_flos": 1.69525812326826e+17,
87
- "train_batch_size": 2,
88
  "trial_name": null,
89
  "trial_params": null
90
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.18812463256907702,
6
  "eval_steps": 500,
7
  "global_step": 80,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.023515579071134628,
14
+ "grad_norm": 2.4936752319335938,
15
+ "learning_rate": 6.75e-05,
16
+ "loss": 3.3727,
17
  "step": 10
18
  },
19
  {
20
+ "epoch": 0.047031158142269255,
21
+ "grad_norm": 1.268250823020935,
22
+ "learning_rate": 0.0001425,
23
+ "loss": 2.4747,
24
  "step": 20
25
  },
26
  {
27
+ "epoch": 0.07054673721340388,
28
+ "grad_norm": 0.7520093321800232,
29
+ "learning_rate": 0.00021749999999999997,
30
+ "loss": 1.6981,
31
  "step": 30
32
  },
33
  {
34
+ "epoch": 0.09406231628453851,
35
+ "grad_norm": 0.7805754542350769,
36
+ "learning_rate": 0.00029249999999999995,
37
+ "loss": 1.1544,
38
  "step": 40
39
  },
40
  {
41
+ "epoch": 0.11757789535567313,
42
+ "grad_norm": 0.3181486427783966,
43
+ "learning_rate": 0.0002999904831331692,
44
+ "loss": 0.889,
45
  "step": 50
46
  },
47
  {
48
+ "epoch": 0.14109347442680775,
49
+ "grad_norm": 0.1686788648366928,
50
+ "learning_rate": 0.00029995758687228834,
51
+ "loss": 0.765,
52
  "step": 60
53
  },
54
  {
55
+ "epoch": 0.1646090534979424,
56
+ "grad_norm": 0.16962774097919464,
57
+ "learning_rate": 0.000299901198877339,
58
+ "loss": 0.7138,
59
  "step": 70
60
  },
61
  {
62
+ "epoch": 0.18812463256907702,
63
+ "grad_norm": 0.1931052953004837,
64
+ "learning_rate": 0.0002998213279818309,
65
+ "loss": 0.6962,
66
  "step": 80
67
  }
68
  ],
69
  "logging_steps": 10,
70
+ "max_steps": 2550,
71
  "num_input_tokens_seen": 0,
72
+ "num_train_epochs": 6,
73
+ "save_steps": 80,
74
  "stateful_callbacks": {
75
  "TrainerControl": {
76
  "args": {
 
83
  "attributes": {}
84
  }
85
  },
86
+ "total_flos": 1.1592700847456256e+17,
87
+ "train_batch_size": 4,
88
  "trial_name": null,
89
  "trial_params": null
90
  }
checkpoint-80/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c61b7597101ee42620ee3c35211c8aaa2dd701dc84ac03219baa5f43c59a2eff
3
  size 5688
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db4ff7073a4da3309636bbb72b82f1473dd3f796afe5e7d3e7687cadcf17ca0f
3
  size 5688