jzhang-x committed on
Commit
b8d2252
·
verified ·
1 Parent(s): 597640b

Model save

Browse files
README.md CHANGED
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jzhang-x-01/huggingface/runs/f17te69y)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/jzhang-x-01/huggingface/runs/l1ih86fe)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00041823586402218955,
4
- "train_runtime": 8141.2102,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.921,
7
  "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.002154490305616204,
4
+ "train_runtime": 8228.6792,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.911,
7
  "train_steps_per_second": 0.007
8
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d372229b5765122f0f665a6bc9ea66c8e501cade1deea988f753f4546d8c01e2
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da4320c5ba6d132b13e0697de200379e6696bb453e297dd29a845b6cf0a5ad11
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cdf88cc467578d4d66756ce7b0778b49989445997baecfa542f50a7df748a535
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb5180786ab5e7eb5c36cbfc486ba5d1a87942134f42b637939668da6e4b7b68
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fb9ae13bc1ac27d2fe51b43d97591e1d9762abde6837c547e2c52d5b61fca3bd
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c5df4aabc965da16054c18701d906c2afcfea880192d0a5ac4c9759ba58ae05
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed2f6405de1c7e3a363f3541d86a91cc532a81425d2727016b466c48f17be4a9
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aac5bbf08545411482d72b15e3233be68a3e299b0f204d9e15b514f68fa374df
3
  size 1089994880
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.00041823586402218955,
4
- "train_runtime": 8141.2102,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.921,
7
  "train_steps_per_second": 0.007
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.002154490305616204,
4
+ "train_runtime": 8228.6792,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.911,
7
  "train_steps_per_second": 0.007
8
  }
trainer_state.json CHANGED
@@ -9,9 +9,9 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "completion_length": 606.4564895629883,
13
  "epoch": 0.017057569296375266,
14
- "grad_norm": 5.349207388441972,
15
  "kl": 0.0,
16
  "learning_rate": 5e-07,
17
  "loss": 0.0,
@@ -22,161 +22,161 @@
22
  "step": 1
23
  },
24
  {
25
- "completion_length": 603.4832859039307,
26
  "epoch": 0.08528784648187633,
27
- "grad_norm": 9.588701812058716,
28
- "kl": 0.00046131014823913574,
29
  "learning_rate": 2.5e-06,
30
  "loss": 0.0,
31
- "reward": 0.6004464514553547,
32
- "reward_std": 0.3618758851662278,
33
- "rewards/accuracy_reward": 0.6004464514553547,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
- "completion_length": 620.3982391357422,
39
  "epoch": 0.17057569296375266,
40
- "grad_norm": 0.4408238121081561,
41
- "kl": 0.016131591796875,
42
  "learning_rate": 2.956412726139078e-06,
43
- "loss": 0.0006,
44
- "reward": 0.6678571753203869,
45
- "reward_std": 0.3235505763441324,
46
- "rewards/accuracy_reward": 0.6678571753203869,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
- "completion_length": 636.4580642700196,
52
  "epoch": 0.255863539445629,
53
- "grad_norm": 70.0216466198137,
54
- "kl": 0.021285057067871094,
55
  "learning_rate": 2.7836719084521715e-06,
56
- "loss": 0.0009,
57
- "reward": 0.7464286029338837,
58
- "reward_std": 0.24510460048913957,
59
- "rewards/accuracy_reward": 0.7464286029338837,
60
  "rewards/format_reward": 0.0,
61
  "step": 15
62
  },
63
  {
64
- "completion_length": 617.2065002441407,
65
  "epoch": 0.3411513859275053,
66
- "grad_norm": 0.12452084814574951,
67
- "kl": 0.004275131225585938,
68
  "learning_rate": 2.4946839873611927e-06,
69
- "loss": 0.0002,
70
- "reward": 0.752678605914116,
71
- "reward_std": 0.22502579726278782,
72
- "rewards/accuracy_reward": 0.752678605914116,
73
  "rewards/format_reward": 0.0,
74
  "step": 20
75
  },
76
  {
77
- "completion_length": 620.6533752441406,
78
  "epoch": 0.42643923240938164,
79
- "grad_norm": 0.4497624610789142,
80
- "kl": 0.006324386596679688,
81
  "learning_rate": 2.1156192081791355e-06,
82
  "loss": 0.0003,
83
- "reward": 0.7526786029338837,
84
- "reward_std": 0.20553307328373194,
85
- "rewards/accuracy_reward": 0.7526786029338837,
86
  "rewards/format_reward": 0.0,
87
  "step": 25
88
  },
89
  {
90
- "completion_length": 617.4848472595215,
91
  "epoch": 0.511727078891258,
92
- "grad_norm": 7.695959344352079,
93
- "kl": 0.014894866943359375,
94
  "learning_rate": 1.6808050203829845e-06,
95
- "loss": 0.0006,
96
- "reward": 0.7448661029338837,
97
- "reward_std": 0.19868585970252753,
98
- "rewards/accuracy_reward": 0.7448661029338837,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
- "completion_length": 612.6093978881836,
104
  "epoch": 0.5970149253731343,
105
- "grad_norm": 0.3012279035059688,
106
- "kl": 0.031812286376953124,
107
  "learning_rate": 1.2296174432791415e-06,
108
- "loss": 0.0013,
109
- "reward": 0.739732176065445,
110
- "reward_std": 0.19879167079925536,
111
- "rewards/accuracy_reward": 0.739732176065445,
112
  "rewards/format_reward": 0.0,
113
  "step": 35
114
  },
115
  {
116
- "completion_length": 597.8221229553222,
117
  "epoch": 0.6823027718550106,
118
- "grad_norm": 0.13735068646343107,
119
- "kl": 0.00665283203125,
120
  "learning_rate": 8.029152419343472e-07,
121
  "loss": 0.0003,
122
- "reward": 0.7627232506871223,
123
- "reward_std": 0.20173386242240668,
124
- "rewards/accuracy_reward": 0.7627232506871223,
125
  "rewards/format_reward": 0.0,
126
  "step": 40
127
  },
128
  {
129
- "completion_length": 607.5589538574219,
130
  "epoch": 0.767590618336887,
131
- "grad_norm": 0.09444476728427914,
132
- "kl": 0.006229019165039063,
133
  "learning_rate": 4.3933982822017883e-07,
134
- "loss": 0.0002,
135
- "reward": 0.752901816368103,
136
- "reward_std": 0.19051024238578976,
137
- "rewards/accuracy_reward": 0.752901816368103,
138
  "rewards/format_reward": 0.0,
139
  "step": 45
140
  },
141
  {
142
- "completion_length": 609.9201133728027,
143
  "epoch": 0.8528784648187633,
144
- "grad_norm": 0.13825347710108976,
145
- "kl": 0.005415916442871094,
146
  "learning_rate": 1.718159615201853e-07,
147
- "loss": 0.0002,
148
- "reward": 0.7462053894996643,
149
- "reward_std": 0.19213472940027715,
150
- "rewards/accuracy_reward": 0.7462053894996643,
151
  "rewards/format_reward": 0.0,
152
  "step": 50
153
  },
154
  {
155
- "completion_length": 603.0685539245605,
156
  "epoch": 0.9381663113006397,
157
- "grad_norm": 0.11004891600095792,
158
- "kl": 0.00497283935546875,
159
  "learning_rate": 2.4570139579284723e-08,
160
- "loss": 0.0002,
161
- "reward": 0.7816964641213417,
162
- "reward_std": 0.19411077070981264,
163
- "rewards/accuracy_reward": 0.7816964641213417,
164
  "rewards/format_reward": 0.0,
165
  "step": 55
166
  },
167
  {
168
- "completion_length": 591.0379651387533,
169
  "epoch": 0.9893390191897654,
170
- "kl": 0.0048402150472005205,
171
- "reward": 0.7686012263099352,
172
- "reward_std": 0.16828599898144603,
173
- "rewards/accuracy_reward": 0.7686012263099352,
174
  "rewards/format_reward": 0.0,
175
  "step": 58,
176
  "total_flos": 0.0,
177
- "train_loss": 0.00041823586402218955,
178
- "train_runtime": 8141.2102,
179
- "train_samples_per_second": 0.921,
180
  "train_steps_per_second": 0.007
181
  }
182
  ],
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "completion_length": 606.5457763671875,
13
  "epoch": 0.017057569296375266,
14
+ "grad_norm": 5.351304250136567,
15
  "kl": 0.0,
16
  "learning_rate": 5e-07,
17
  "loss": 0.0,
 
22
  "step": 1
23
  },
24
  {
25
+ "completion_length": 607.6144218444824,
26
  "epoch": 0.08528784648187633,
27
+ "grad_norm": 11.35565102634353,
28
+ "kl": 0.00033593177795410156,
29
  "learning_rate": 2.5e-06,
30
  "loss": 0.0,
31
+ "reward": 0.6026785979047418,
32
+ "reward_std": 0.3518537702038884,
33
+ "rewards/accuracy_reward": 0.6026785979047418,
34
  "rewards/format_reward": 0.0,
35
  "step": 5
36
  },
37
  {
38
+ "completion_length": 622.7357368469238,
39
  "epoch": 0.17057569296375266,
40
+ "grad_norm": 1.9080536354821958,
41
+ "kl": 0.004211950302124024,
42
  "learning_rate": 2.956412726139078e-06,
43
+ "loss": 0.0002,
44
+ "reward": 0.6660714566707611,
45
+ "reward_std": 0.31057145930826663,
46
+ "rewards/accuracy_reward": 0.6660714566707611,
47
  "rewards/format_reward": 0.0,
48
  "step": 10
49
  },
50
  {
51
+ "completion_length": 631.8491333007812,
52
  "epoch": 0.255863539445629,
53
+ "grad_norm": 0.20168403792141756,
54
+ "kl": 0.007109832763671875,
55
  "learning_rate": 2.7836719084521715e-06,
56
+ "loss": 0.0003,
57
+ "reward": 0.7424107462167739,
58
+ "reward_std": 0.2543599892407656,
59
+ "rewards/accuracy_reward": 0.7424107462167739,
60
  "rewards/format_reward": 0.0,
61
  "step": 15
62
  },
63
  {
64
+ "completion_length": 622.2803810119628,
65
  "epoch": 0.3411513859275053,
66
+ "grad_norm": 0.495428185088676,
67
+ "kl": 0.02277984619140625,
68
  "learning_rate": 2.4946839873611927e-06,
69
+ "loss": 0.0009,
70
+ "reward": 0.7529018178582192,
71
+ "reward_std": 0.23195756375789642,
72
+ "rewards/accuracy_reward": 0.7529018178582192,
73
  "rewards/format_reward": 0.0,
74
  "step": 20
75
  },
76
  {
77
+ "completion_length": 629.6663253784179,
78
  "epoch": 0.42643923240938164,
79
+ "grad_norm": 0.2077076072198444,
80
+ "kl": 0.0070018768310546875,
81
  "learning_rate": 2.1156192081791355e-06,
82
  "loss": 0.0003,
83
+ "reward": 0.7497768178582191,
84
+ "reward_std": 0.22464404683560132,
85
+ "rewards/accuracy_reward": 0.7497768178582191,
86
  "rewards/format_reward": 0.0,
87
  "step": 25
88
  },
89
  {
90
+ "completion_length": 628.7462333679199,
91
  "epoch": 0.511727078891258,
92
+ "grad_norm": 223.6982144990451,
93
+ "kl": 0.3876213073730469,
94
  "learning_rate": 1.6808050203829845e-06,
95
+ "loss": 0.0155,
96
+ "reward": 0.7379464656114578,
97
+ "reward_std": 0.21265463214367628,
98
+ "rewards/accuracy_reward": 0.7379464656114578,
99
  "rewards/format_reward": 0.0,
100
  "step": 30
101
  },
102
  {
103
+ "completion_length": 614.8966758728027,
104
  "epoch": 0.5970149253731343,
105
+ "grad_norm": 2.3665634575263748,
106
+ "kl": 0.005752182006835938,
107
  "learning_rate": 1.2296174432791415e-06,
108
+ "loss": 0.0002,
109
+ "reward": 0.7314732424914837,
110
+ "reward_std": 0.19922155924141408,
111
+ "rewards/accuracy_reward": 0.7314732424914837,
112
  "rewards/format_reward": 0.0,
113
  "step": 35
114
  },
115
  {
116
+ "completion_length": 603.5955581665039,
117
  "epoch": 0.6823027718550106,
118
+ "grad_norm": 0.41861750967507205,
119
+ "kl": 0.007678604125976563,
120
  "learning_rate": 8.029152419343472e-07,
121
  "loss": 0.0003,
122
+ "reward": 0.7553571775555611,
123
+ "reward_std": 0.20080968737602234,
124
+ "rewards/accuracy_reward": 0.7553571775555611,
125
  "rewards/format_reward": 0.0,
126
  "step": 40
127
  },
128
  {
129
+ "completion_length": 619.6895401000977,
130
  "epoch": 0.767590618336887,
131
+ "grad_norm": 0.5526088012366769,
132
+ "kl": 0.007487106323242188,
133
  "learning_rate": 4.3933982822017883e-07,
134
+ "loss": 0.0003,
135
+ "reward": 0.725000037252903,
136
+ "reward_std": 0.23073445297777653,
137
+ "rewards/accuracy_reward": 0.725000037252903,
138
  "rewards/format_reward": 0.0,
139
  "step": 45
140
  },
141
  {
142
+ "completion_length": 613.0727882385254,
143
  "epoch": 0.8528784648187633,
144
+ "grad_norm": 1.1048247702884233,
145
+ "kl": 0.00660247802734375,
146
  "learning_rate": 1.718159615201853e-07,
147
+ "loss": 0.0003,
148
+ "reward": 0.7332589656114579,
149
+ "reward_std": 0.19824975840747355,
150
+ "rewards/accuracy_reward": 0.7332589656114579,
151
  "rewards/format_reward": 0.0,
152
  "step": 50
153
  },
154
  {
155
+ "completion_length": 608.9152099609375,
156
  "epoch": 0.9381663113006397,
157
+ "grad_norm": 0.688326952525324,
158
+ "kl": 0.006847000122070313,
159
  "learning_rate": 2.4570139579284723e-08,
160
+ "loss": 0.0003,
161
+ "reward": 0.7656250342726707,
162
+ "reward_std": 0.21631054822355508,
163
+ "rewards/accuracy_reward": 0.7656250342726707,
164
  "rewards/format_reward": 0.0,
165
  "step": 55
166
  },
167
  {
168
+ "completion_length": 603.1421381632487,
169
  "epoch": 0.9893390191897654,
170
+ "kl": 0.0070037841796875,
171
+ "reward": 0.751860149204731,
172
+ "reward_std": 0.20624662407984337,
173
+ "rewards/accuracy_reward": 0.751860149204731,
174
  "rewards/format_reward": 0.0,
175
  "step": 58,
176
  "total_flos": 0.0,
177
+ "train_loss": 0.002154490305616204,
178
+ "train_runtime": 8228.6792,
179
+ "train_samples_per_second": 0.911,
180
  "train_steps_per_second": 0.007
181
  }
182
  ],
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a102f47943bd94aa7ecc64e47c8daa56e3ea0e2ce38b52d8866da5a59dd6ffd
3
  size 7928
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:095c9ce79540846bb0717018dbf2b9b1f9126d535c416a479f2bd2e2f09039f3
3
  size 7928