Dongwei committed · verified
Commit 7c36af4 · 1 Parent(s): 6692eb1

Model save

README.md CHANGED
@@ -1,11 +1,9 @@
 ---
 base_model: Qwen/Qwen2.5-Math-7B
-datasets: Dongwei/Math_8K_for_GRPO
 library_name: transformers
 model_name: Qwen-2.5-7B_Base_Math_smalllr_newdata
 tags:
 - generated_from_trainer
-- open-r1
 - trl
 - grpo
 licence: license
@@ -13,7 +11,7 @@ licence: license
 
 # Model Card for Qwen-2.5-7B_Base_Math_smalllr_newdata
 
-This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B) on the [Dongwei/Math_8K_for_GRPO](https://huggingface.co/datasets/Dongwei/Math_8K_for_GRPO) dataset.
+This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
 It has been trained using [TRL](https://github.com/huggingface/trl).
 
 ## Quick start
@@ -29,7 +27,7 @@ print(output["generated_text"])
 
 ## Training procedure
 
-[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/dongwei_jiang/huggingface/runs/kvh15moq)
+[<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/dongwei_jiang/huggingface/runs/652a79wz)
 
 
 This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
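Only the trailing print(output["generated_text"]) line of the card's Quick start survives in this diff's hunk context. For orientation, a minimal usage sketch follows; the repository id is assumed from the model_name plus the committer's namespace, and the prompt is illustrative rather than taken from the card.

```python
# Minimal usage sketch (not the card's exact Quick start snippet, which this diff truncates).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Dongwei/Qwen-2.5-7B_Base_Math_smalllr_newdata"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="auto")

prompt = "What is 12 * 34 - 56?"  # illustrative prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))
```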
all_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0001293725906606009,
-    "train_runtime": 14407.4123,
+    "train_loss": 0.00014615306474896194,
+    "train_runtime": 28501.5732,
     "train_samples": 8890,
-    "train_samples_per_second": 0.617,
-    "train_steps_per_second": 0.005
+    "train_samples_per_second": 0.624,
+    "train_steps_per_second": 0.006
 }
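The new throughput numbers are consistent with two passes over the 8,890 training samples in 158 optimizer steps (both figures appear in trainer_state.json below); a quick arithmetic check under that reading:

```python
# Consistency check of the reported throughput, assuming 2 epochs over 8,890 samples
# and 158 optimizer steps (both taken from trainer_state.json in this commit).
train_samples = 8890
num_epochs = 2
global_steps = 158
train_runtime = 28501.5732  # seconds

print(round(train_samples * num_epochs / train_runtime, 3))  # ~0.624, matches "train_samples_per_second"
print(round(global_steps / train_runtime, 3))                # ~0.006, matches "train_steps_per_second"
```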
config.json CHANGED
@@ -23,7 +23,7 @@
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.49.0.dev0",
-  "use_cache": true,
+  "use_cache": false,
   "use_mrope": false,
   "use_sliding_window": false,
   "vocab_size": 152064
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8972c8502993056e5a729fd0afcc3e5615e006c6f6ae9c48e3c010c27911e217
+oid sha256:187bfcc26ec49b526f3552d5beb693685c033886a74e6f90b9da0d792e3034bb
 size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4444bd3f19d52fe2eaaf2f4d572c48b0e45aeb1ee01712343968922920667cf1
+oid sha256:ae2f3bd6a1e3a6cbd063ea21ae30aa9c41adff47681ec50a0e9af5865fc89c14
 size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8f7ec89a2ae9bc3f162701488aec0c11f73dd9ab89ea4a1d8b13f98d0c955546
+oid sha256:713b58386664b4c694580e30b72548c7693541e5856f477176032cdcaa75c50f
 size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6683f32c99805f719d04aea0c1aeb591f69bfcf3aebca5e4a901bb965246b1ca
+oid sha256:5cc3484c418f6b882c73d89ba5f3009b0e83924c78cd04d97d59b50ddb474d03
 size 1089994880
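The four *.safetensors entries are Git LFS pointers (spec version, sha256 oid, byte size) rather than the weights themselves, so this commit replaces every shard's oid while the sizes stay identical. A small sketch for verifying a downloaded shard against its pointer; the local file path is assumed:

```python
# Verify a downloaded shard against its Git LFS pointer (the oid is the SHA-256 of the file).
import hashlib
from pathlib import Path

def sha256_of(path: Path, chunk_size: int = 1 << 20) -> str:
    h = hashlib.sha256()
    with path.open("rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

shard = Path("model-00001-of-00004.safetensors")  # assumed local path
expected_oid = "187bfcc26ec49b526f3552d5beb693685c033886a74e6f90b9da0d792e3034bb"  # new pointer in this commit
expected_size = 4877660776

assert shard.stat().st_size == expected_size
assert sha256_of(shard) == expected_oid
print("shard matches its LFS pointer")
```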
train_results.json CHANGED
@@ -1,8 +1,8 @@
 {
     "total_flos": 0.0,
-    "train_loss": 0.0001293725906606009,
-    "train_runtime": 14407.4123,
+    "train_loss": 0.00014615306474896194,
+    "train_runtime": 28501.5732,
     "train_samples": 8890,
-    "train_samples_per_second": 0.617,
-    "train_steps_per_second": 0.005
+    "train_samples_per_second": 0.624,
+    "train_steps_per_second": 0.006
 }
trainer_state.json CHANGED
@@ -1,124 +1,228 @@
 {
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 0.9952755905511811,
+  "epoch": 1.9826771653543307,
   "eval_steps": 100,
-  "global_step": 79,
+  "global_step": 158,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
   "log_history": [
     {
-      "completion_length": 662.1981317520142,
+      "completion_length": 652.5913221359253,
       "epoch": 0.12598425196850394,
-      "grad_norm": 0.34752824902534485,
-      "kl": 0.00043702125549316406,
-      "learning_rate": 2.994130233112417e-06,
+      "grad_norm": 0.5412344932556152,
+      "kl": 0.00025856494903564453,
+      "learning_rate": 1.875e-06,
       "loss": 0.0,
-      "reward": 0.5890625271946192,
-      "reward_std": 0.3153728174045682,
-      "rewards/accuracy_reward": 0.5890625271946192,
-      "rewards/format_reward": 0.0,
+      "reward": 0.5777902046218515,
+      "reward_std": 0.32899713758379223,
+      "rewards/accuracy_reward": 0.5776785971596837,
+      "rewards/format_reward": 0.00011160714784637093,
       "step": 10
     },
     {
-      "completion_length": 687.3097408294677,
+      "completion_length": 694.438868522644,
       "epoch": 0.25196850393700787,
-      "grad_norm": 0.1284249722957611,
-      "kl": 0.002486562728881836,
-      "learning_rate": 2.7934718587800417e-06,
+      "grad_norm": 0.24628566205501556,
+      "kl": 0.0019156813621520996,
+      "learning_rate": 2.994130233112417e-06,
       "loss": 0.0001,
-      "reward": 0.6207589566707611,
-      "reward_std": 0.2419956461992115,
-      "rewards/accuracy_reward": 0.6207589566707611,
+      "reward": 0.6052455639466643,
+      "reward_std": 0.26475782548077403,
+      "rewards/accuracy_reward": 0.6052455639466643,
       "rewards/format_reward": 0.0,
       "step": 20
     },
     {
-      "completion_length": 640.5951181411743,
+      "completion_length": 651.5066148757935,
       "epoch": 0.3779527559055118,
-      "grad_norm": 0.10701598972082138,
-      "kl": 0.002702617645263672,
-      "learning_rate": 2.343673931461171e-06,
+      "grad_norm": 0.14223581552505493,
+      "kl": 0.0024756908416748045,
+      "learning_rate": 2.9286218000371976e-06,
       "loss": 0.0001,
-      "reward": 0.6760044954717159,
-      "reward_std": 0.21417916007339954,
-      "rewards/accuracy_reward": 0.6760044954717159,
+      "reward": 0.6724330654367805,
+      "reward_std": 0.23531078966334462,
+      "rewards/accuracy_reward": 0.6724330654367805,
       "rewards/format_reward": 0.0,
       "step": 30
     },
     {
-      "completion_length": 635.6863019943237,
+      "completion_length": 642.1838449478149,
       "epoch": 0.5039370078740157,
-      "grad_norm": 0.07730241119861603,
-      "kl": 0.003352832794189453,
-      "learning_rate": 1.7313733994479534e-06,
+      "grad_norm": 0.1239105761051178,
+      "kl": 0.0031515121459960937,
+      "learning_rate": 2.7934718587800417e-06,
       "loss": 0.0001,
-      "reward": 0.7145089605823159,
-      "reward_std": 0.19259586185216904,
-      "rewards/accuracy_reward": 0.7145089605823159,
+      "reward": 0.7046875322237611,
+      "reward_std": 0.19434297760017216,
+      "rewards/accuracy_reward": 0.7046875322237611,
       "rewards/format_reward": 0.0,
       "step": 40
     },
     {
-      "completion_length": 627.000700378418,
+      "completion_length": 627.14924659729,
       "epoch": 0.6299212598425197,
-      "grad_norm": 0.12998902797698975,
-      "kl": 0.0035908699035644533,
-      "learning_rate": 1.0745073324985549e-06,
+      "grad_norm": 0.13240313529968262,
+      "kl": 0.003639984130859375,
+      "learning_rate": 2.595268609058752e-06,
       "loss": 0.0001,
-      "reward": 0.7094866376370191,
-      "reward_std": 0.19745821370743216,
-      "rewards/accuracy_reward": 0.7094866376370191,
+      "reward": 0.7179687809199095,
+      "reward_std": 0.19313886840827763,
+      "rewards/accuracy_reward": 0.7179687809199095,
       "rewards/format_reward": 0.0,
       "step": 50
     },
     {
-      "completion_length": 633.1238014221192,
+      "completion_length": 626.9296024322509,
       "epoch": 0.7559055118110236,
-      "grad_norm": 0.2921755015850067,
-      "kl": 0.0037825584411621095,
-      "learning_rate": 4.995967037450238e-07,
+      "grad_norm": 0.15062075853347778,
+      "kl": 0.004168796539306641,
+      "learning_rate": 2.343673931461171e-06,
       "loss": 0.0002,
-      "reward": 0.6771205646917224,
-      "reward_std": 0.2015925908461213,
-      "rewards/accuracy_reward": 0.6771205646917224,
+      "reward": 0.6809152102097868,
+      "reward_std": 0.1983337783254683,
+      "rewards/accuracy_reward": 0.6809152102097868,
       "rewards/format_reward": 0.0,
       "step": 60
     },
     {
-      "completion_length": 628.302260017395,
+      "completion_length": 610.840876197815,
       "epoch": 0.8818897637795275,
-      "grad_norm": 0.21092118322849274,
-      "kl": 0.005428695678710937,
-      "learning_rate": 1.1737679983668259e-07,
+      "grad_norm": 0.11126791685819626,
+      "kl": 0.004203128814697266,
+      "learning_rate": 2.0509523964971355e-06,
       "loss": 0.0002,
-      "reward": 0.7151786023750901,
-      "reward_std": 0.19608180108480155,
-      "rewards/accuracy_reward": 0.7151786023750901,
+      "reward": 0.7165178887546062,
+      "reward_std": 0.1934912689961493,
+      "rewards/accuracy_reward": 0.7165178887546062,
       "rewards/format_reward": 0.0,
       "step": 70
     },
     {
-      "completion_length": 609.5488855573866,
-      "epoch": 0.9952755905511811,
-      "kl": 0.0041395823160807295,
-      "reward": 0.7307788046697775,
-      "reward_std": 0.18725943296319908,
-      "rewards/accuracy_reward": 0.7307788046697775,
+      "completion_length": 592.2336000569661,
+      "epoch": 1.0,
+      "grad_norm": 0.17808477580547333,
+      "kl": 0.0042411295572916665,
+      "learning_rate": 1.7313733994479534e-06,
+      "loss": 0.0002,
+      "reward": 0.7291666994492213,
+      "reward_std": 0.1846819964547952,
+      "rewards/accuracy_reward": 0.7291666994492213,
+      "rewards/format_reward": 0.0,
+      "step": 80
+    },
+    {
+      "completion_length": 609.6807176589966,
+      "epoch": 1.125984251968504,
+      "grad_norm": 0.08229045569896698,
+      "kl": 0.004135942459106446,
+      "learning_rate": 1.4005155653473445e-06,
+      "loss": 0.0002,
+      "reward": 0.7156250355765224,
+      "reward_std": 0.20761510250158607,
+      "rewards/accuracy_reward": 0.7156250355765224,
+      "rewards/format_reward": 0.0,
+      "step": 90
+    },
+    {
+      "completion_length": 611.2092897415162,
+      "epoch": 1.2519685039370079,
+      "grad_norm": 0.16556662321090698,
+      "kl": 0.0037270545959472655,
+      "learning_rate": 1.0745073324985549e-06,
+      "loss": 0.0001,
+      "reward": 0.7110491398721933,
+      "reward_std": 0.18295098417438566,
+      "rewards/accuracy_reward": 0.7110491398721933,
+      "rewards/format_reward": 0.0,
+      "step": 100
+    },
+    {
+      "completion_length": 606.3881959915161,
+      "epoch": 1.3779527559055118,
+      "grad_norm": 0.09432197362184525,
+      "kl": 0.0037145614624023438,
+      "learning_rate": 7.692407340588056e-07,
+      "loss": 0.0001,
+      "reward": 0.7323661027476192,
+      "reward_std": 0.1929833421483636,
+      "rewards/accuracy_reward": 0.7323661027476192,
+      "rewards/format_reward": 0.0,
+      "step": 110
+    },
+    {
+      "completion_length": 604.5668788909912,
+      "epoch": 1.5039370078740157,
+      "grad_norm": 0.38694441318511963,
+      "kl": 0.004090404510498047,
+      "learning_rate": 4.995967037450238e-07,
+      "loss": 0.0002,
+      "reward": 0.7164062798023224,
+      "reward_std": 0.18084403886459768,
+      "rewards/accuracy_reward": 0.7164062798023224,
+      "rewards/format_reward": 0.0,
+      "step": 120
+    },
+    {
+      "completion_length": 606.2777070999146,
+      "epoch": 1.6299212598425197,
+      "grad_norm": 0.15648125112056732,
+      "kl": 0.0037802696228027345,
+      "learning_rate": 2.787196699446389e-07,
+      "loss": 0.0002,
+      "reward": 0.7242187837138772,
+      "reward_std": 0.19052648572251202,
+      "rewards/accuracy_reward": 0.7242187837138772,
+      "rewards/format_reward": 0.0,
+      "step": 130
+    },
+    {
+      "completion_length": 605.7184408187866,
+      "epoch": 1.7559055118110236,
+      "grad_norm": 0.4628942608833313,
+      "kl": 0.003756284713745117,
+      "learning_rate": 1.1737679983668259e-07,
+      "loss": 0.0002,
+      "reward": 0.7152902094647289,
+      "reward_std": 0.20197481904178857,
+      "rewards/accuracy_reward": 0.7152902094647289,
+      "rewards/format_reward": 0.0,
+      "step": 140
+    },
+    {
+      "completion_length": 605.061745262146,
+      "epoch": 1.8818897637795275,
+      "grad_norm": 0.1207461878657341,
+      "kl": 0.007715559005737305,
+      "learning_rate": 2.343312866591163e-08,
+      "loss": 0.0003,
+      "reward": 0.7013393187895417,
+      "reward_std": 0.1918664438650012,
+      "rewards/accuracy_reward": 0.7013393187895417,
+      "rewards/format_reward": 0.0,
+      "step": 150
+    },
+    {
+      "completion_length": 607.0647583007812,
+      "epoch": 1.9826771653543307,
+      "kl": 0.0038232803344726562,
+      "reward": 0.7250279379077256,
+      "reward_std": 0.17406430409755558,
+      "rewards/accuracy_reward": 0.7250279379077256,
       "rewards/format_reward": 0.0,
-      "step": 79,
+      "step": 158,
       "total_flos": 0.0,
-      "train_loss": 0.0001293725906606009,
-      "train_runtime": 14407.4123,
-      "train_samples_per_second": 0.617,
-      "train_steps_per_second": 0.005
+      "train_loss": 0.00014615306474896194,
+      "train_runtime": 28501.5732,
+      "train_samples_per_second": 0.624,
+      "train_steps_per_second": 0.006
     }
   ],
   "logging_steps": 10,
-  "max_steps": 79,
+  "max_steps": 158,
   "num_input_tokens_seen": 0,
-  "num_train_epochs": 1,
+  "num_train_epochs": 2,
   "save_steps": 500,
   "stateful_callbacks": {
     "TrainerControl": {
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:12b648ea5b071ecce323732091588c2678cec73e93d6a189db3cc8362754dd64
+oid sha256:509855e3603ba407b20fab4440b0197a9a8bb6c13a146cf9b0f47765d36950e4
 size 7096
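training_args.bin is the Trainer's pickled arguments object (presumably a GRPO config here), so the full hyperparameters behind the numbers above can be inspected directly. The attribute names below are standard TrainingArguments fields, and weights_only=False is needed on recent PyTorch because the file is a pickled dataclass rather than a tensor checkpoint:

```python
# Inspect the pickled training arguments saved alongside the checkpoint.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(type(args).__name__)
print(args.learning_rate, args.num_train_epochs, args.warmup_ratio)
```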