tahamajs commited on
Commit
973c01d
·
verified ·
1 Parent(s): f16a61e
adapter_config.json CHANGED
@@ -13,7 +13,7 @@
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
- "lora_alpha": 64,
17
  "lora_bias": false,
18
  "lora_dropout": 0.05,
19
  "megatron_config": null,
@@ -21,17 +21,17 @@
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "qalora_group_size": 16,
24
- "r": 64,
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
- "q_proj",
 
29
  "k_proj",
 
30
  "v_proj",
31
  "o_proj",
32
- "down_proj",
33
- "up_proj",
34
- "gate_proj"
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
 
13
  "layers_pattern": null,
14
  "layers_to_transform": null,
15
  "loftq_config": {},
16
+ "lora_alpha": 8,
17
  "lora_bias": false,
18
  "lora_dropout": 0.05,
19
  "megatron_config": null,
 
21
  "modules_to_save": null,
22
  "peft_type": "LORA",
23
  "qalora_group_size": 16,
24
+ "r": 8,
25
  "rank_pattern": {},
26
  "revision": null,
27
  "target_modules": [
28
+ "up_proj",
29
+ "gate_proj",
30
  "k_proj",
31
+ "down_proj",
32
  "v_proj",
33
  "o_proj",
34
+ "q_proj"
 
 
35
  ],
36
  "target_parameters": null,
37
  "task_type": "CAUSAL_LM",
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c534c45ce58ca832785c20c9b4f57dc2d50685b91f57d526ab8eb37b0d72e8eb
3
- size 528550256
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:775041cad228f4037b9c5a3d38070e6204f18576f499f4e5d5bccc10e6b09ebd
3
+ size 66126768
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:132c0fb88b2070b782a69e8833d01ab987b1198ec606df151512d91820abb758
3
- size 11422822
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af560c5e5807192776a9f72a0ea33c9331816d9c59af61e02bdf766f8a2a97fb
3
+ size 11422922
trainer_state.json CHANGED
@@ -2,151 +2,499 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.1507537688442211,
6
- "eval_steps": 500,
7
- "global_step": 30,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "completion_length": 713.0333618164062,
14
- "completions/clipped_ratio": 0.3166666666666667,
15
- "completions/max_length": 1024.0,
16
- "completions/max_terminated_length": 788.2,
17
- "completions/mean_length": 713.0333618164062,
18
- "completions/mean_terminated_length": 567.4916625976563,
19
- "completions/min_length": 398.4,
20
- "completions/min_terminated_length": 398.4,
21
- "epoch": 0.02512562814070352,
22
- "frac_reward_zero_std": 0.7333333432674408,
23
  "grad_norm": NaN,
24
- "kl": 0.0001410087337717414,
25
- "learning_rate": 6.666666666666667e-05,
26
  "loss": 0.0,
27
- "num_tokens": 50802.0,
28
- "reward": 1.1333333730697632,
29
- "reward_std": 0.26666667461395266,
30
- "rewards/r_correctness/mean": 1.1333333492279052,
31
- "rewards/r_correctness/std": 1.002782142162323,
32
- "step": 5
 
 
 
 
 
 
 
 
33
  },
34
  {
35
- "completion_length": 582.8333435058594,
36
- "completions/clipped_ratio": 0.2,
37
- "completions/max_length": 834.8,
38
- "completions/max_terminated_length": 624.2,
39
- "completions/mean_length": 582.8333435058594,
40
- "completions/mean_terminated_length": 475.8500061035156,
41
- "completions/min_length": 364.6,
42
- "completions/min_terminated_length": 364.6,
43
- "epoch": 0.05025125628140704,
44
- "frac_reward_zero_std": 1.0,
45
  "grad_norm": NaN,
46
- "kl": 0.00023892222525319086,
47
- "learning_rate": 0.00015,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  "loss": 0.0,
49
- "num_tokens": 93036.0,
50
- "reward": 1.3333333611488343,
51
- "reward_std": 0.0,
52
- "rewards/r_correctness/mean": 1.3333333611488343,
53
- "rewards/r_correctness/std": 0.7877855658531189,
54
- "step": 10
 
 
 
 
 
 
 
 
55
  },
56
  {
57
- "completion_length": 710.3500183105468,
58
- "completions/clipped_ratio": 0.3666666666666667,
59
- "completions/max_length": 999.0,
60
- "completions/max_terminated_length": 760.6,
61
- "completions/mean_length": 710.3500183105468,
62
- "completions/mean_terminated_length": 554.9857238769531,
63
- "completions/min_length": 431.8,
64
- "completions/min_terminated_length": 431.8,
65
- "epoch": 0.07537688442211055,
66
- "frac_reward_zero_std": 0.8000000059604645,
67
  "grad_norm": NaN,
68
- "kl": 0.00023849248827900738,
69
- "learning_rate": 0.00023333333333333333,
70
  "loss": 0.0,
71
- "num_tokens": 143197.0,
72
- "reward": 1.2666666835546494,
73
- "reward_std": 0.2103133738040924,
74
- "rewards/r_correctness/mean": 1.2666666835546494,
75
- "rewards/r_correctness/std": 0.6902696490287781,
76
- "step": 15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
  },
78
  {
79
- "completion_length": 665.7500244140625,
80
- "completions/clipped_ratio": 0.13333333333333333,
81
- "completions/max_length": 933.8,
82
- "completions/max_terminated_length": 877.8,
83
- "completions/mean_length": 665.7500244140625,
84
- "completions/mean_terminated_length": 611.9416870117187,
85
- "completions/min_length": 422.4,
86
- "completions/min_terminated_length": 422.4,
87
- "epoch": 0.10050251256281408,
88
- "frac_reward_zero_std": 0.7333333432674408,
89
  "grad_norm": NaN,
90
- "kl": 0.011648716410854831,
91
- "learning_rate": 0.00031666666666666665,
92
- "loss": 0.0006,
93
- "num_tokens": 190298.0,
94
- "reward": 1.6666666984558105,
95
- "reward_std": 0.26666667461395266,
96
- "rewards/r_correctness/mean": 1.6666666746139527,
97
- "rewards/r_correctness/std": 0.6490230441093445,
98
- "step": 20
 
 
 
 
 
 
 
 
99
  },
100
  {
101
- "completion_length": 725.2500244140625,
102
- "completions/clipped_ratio": 0.3,
103
- "completions/max_length": 977.2,
104
- "completions/max_terminated_length": 858.0,
105
- "completions/mean_length": 725.2500244140625,
106
- "completions/mean_terminated_length": 611.97197265625,
107
- "completions/min_length": 454.2,
108
- "completions/min_terminated_length": 454.2,
109
- "epoch": 0.12562814070351758,
110
- "frac_reward_zero_std": 0.6666666746139527,
111
  "grad_norm": NaN,
112
- "kl": 2.698609588836041e+21,
113
- "learning_rate": 0.0004,
114
- "loss": 1.349304766270523e+20,
115
- "num_tokens": 241133.0,
116
- "reward": 1.5333333373069764,
117
- "reward_std": 0.3436467111110687,
118
- "rewards/r_correctness/mean": 1.5333333373069764,
119
- "rewards/r_correctness/std": 0.5587599992752075,
120
- "step": 25
 
 
 
 
 
 
 
 
121
  },
122
  {
123
- "completion_length": 765.4000122070313,
124
- "completions/clipped_ratio": 0.4,
125
- "completions/max_length": 969.0,
126
- "completions/max_terminated_length": 763.4,
127
- "completions/mean_length": 765.4000122070313,
128
- "completions/mean_terminated_length": 634.5,
129
- "completions/min_length": 484.2,
130
- "completions/min_terminated_length": 484.2,
131
- "epoch": 0.1507537688442211,
132
- "frac_reward_zero_std": 0.5333333432674408,
133
- "grad_norm": 0.0238532405346632,
134
- "kl": 0.29226372241973875,
135
- "learning_rate": 0.00048333333333333334,
136
- "loss": 0.0146,
137
- "num_tokens": 294589.0,
138
- "reward": 1.3666667222976685,
139
- "reward_std": 0.487293416261673,
140
- "rewards/r_correctness/mean": 1.3666666507720948,
141
- "rewards/r_correctness/std": 0.923210608959198,
142
- "step": 30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  }
144
  ],
145
- "logging_steps": 5,
146
- "max_steps": 597,
147
- "num_input_tokens_seen": 294589,
148
- "num_train_epochs": 3,
149
- "save_steps": 500,
150
  "stateful_callbacks": {
151
  "TrainerControl": {
152
  "args": {
@@ -160,7 +508,7 @@
160
  }
161
  },
162
  "total_flos": 0.0,
163
- "train_batch_size": 12,
164
  "trial_name": null,
165
  "trial_params": null
166
  }
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.008565310492505354,
6
+ "eval_steps": 1,
7
+ "global_step": 16,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "completion_length": 327.875,
14
+ "completions/clipped_ratio": 0.0,
15
+ "completions/max_length": 422.0,
16
+ "completions/max_terminated_length": 422.0,
17
+ "completions/mean_length": 327.875,
18
+ "completions/mean_terminated_length": 327.875,
19
+ "completions/min_length": 256.0,
20
+ "completions/min_terminated_length": 256.0,
21
+ "epoch": 0.0005353319057815846,
22
+ "frac_reward_zero_std": 0.0,
23
  "grad_norm": NaN,
24
+ "kl": 0.0,
25
+ "learning_rate": 0.0,
26
  "loss": 0.0,
27
+ "num_tokens": 3479.0,
28
+ "reward": 1.3822917938232422,
29
+ "reward_std": 0.001679160282947123,
30
+ "rewards/r_correctness/mean": 1.0,
31
+ "rewards/r_correctness/std": 1.0690449476242065,
32
+ "rewards/r_shaping/mean": -0.01770833320915699,
33
+ "rewards/r_shaping/std": 0.00442595174536109,
34
+ "rewards/r_soft/mean": 0.30000001192092896,
35
+ "rewards/r_soft/std": 0.0,
36
+ "rewards/r_strict/mean": 0.0,
37
+ "rewards/r_strict/std": 0.0,
38
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
39
+ "rewards/r_xmlcount/std": 0.0,
40
+ "step": 1
41
  },
42
  {
43
+ "completion_length": 382.25,
44
+ "completions/clipped_ratio": 0.0,
45
+ "completions/max_length": 486.0,
46
+ "completions/max_terminated_length": 486.0,
47
+ "completions/mean_length": 382.25,
48
+ "completions/mean_terminated_length": 382.25,
49
+ "completions/min_length": 313.0,
50
+ "completions/min_terminated_length": 313.0,
51
+ "epoch": 0.0010706638115631692,
52
+ "frac_reward_zero_std": 0.0,
53
  "grad_norm": NaN,
54
+ "kl": 0.0,
55
+ "learning_rate": 0.0005,
56
+ "loss": 0.0001,
57
+ "num_tokens": 7457.0,
58
+ "reward": 2.382591724395752,
59
+ "reward_std": 0.0023273415863513947,
60
+ "rewards/r_correctness/mean": 2.0,
61
+ "rewards/r_correctness/std": 0.0,
62
+ "rewards/r_shaping/mean": -0.017408333718776703,
63
+ "rewards/r_shaping/std": 0.005641527008265257,
64
+ "rewards/r_soft/mean": 0.30000001192092896,
65
+ "rewards/r_soft/std": 0.0,
66
+ "rewards/r_strict/mean": 0.0,
67
+ "rewards/r_strict/std": 0.0,
68
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
69
+ "rewards/r_xmlcount/std": 0.0,
70
+ "step": 2
71
+ },
72
+ {
73
+ "completion_length": 555.125,
74
+ "completions/clipped_ratio": 0.5,
75
+ "completions/max_length": 700.0,
76
+ "completions/max_terminated_length": 534.0,
77
+ "completions/mean_length": 555.125,
78
+ "completions/mean_terminated_length": 410.25,
79
+ "completions/min_length": 228.0,
80
+ "completions/min_terminated_length": 228.0,
81
+ "epoch": 0.0016059957173447537,
82
+ "frac_reward_zero_std": 0.0,
83
+ "grad_norm": NaN,
84
+ "kl": 0.0,
85
+ "learning_rate": 0.0004998023493068255,
86
  "loss": 0.0,
87
+ "num_tokens": 12818.0,
88
+ "reward": 1.117954134941101,
89
+ "reward_std": 0.009724103845655918,
90
+ "rewards/r_correctness/mean": 1.0,
91
+ "rewards/r_correctness/std": 1.0690449476242065,
92
+ "rewards/r_shaping/mean": -0.028920834884047508,
93
+ "rewards/r_shaping/std": 0.008277526125311852,
94
+ "rewards/r_soft/mean": 0.15000000596046448,
95
+ "rewards/r_soft/std": 0.16035674512386322,
96
+ "rewards/r_strict/mean": 0.0,
97
+ "rewards/r_strict/std": 0.0,
98
+ "rewards/r_xmlcount/mean": -0.0031250007450580597,
99
+ "rewards/r_xmlcount/std": 0.11054855585098267,
100
+ "step": 3
101
  },
102
  {
103
+ "completion_length": 633.375,
104
+ "completions/clipped_ratio": 0.125,
105
+ "completions/max_length": 700.0,
106
+ "completions/max_terminated_length": 676.0,
107
+ "completions/mean_length": 633.375,
108
+ "completions/mean_terminated_length": 623.857177734375,
109
+ "completions/min_length": 507.0,
110
+ "completions/min_terminated_length": 507.0,
111
+ "epoch": 0.0021413276231263384,
112
+ "frac_reward_zero_std": 0.0,
113
  "grad_norm": NaN,
114
+ "kl": 0.0,
115
+ "learning_rate": 0.0004992097097536739,
116
  "loss": 0.0,
117
+ "num_tokens": 18821.0,
118
+ "reward": 1.3096479177474976,
119
+ "reward_std": 0.12074954062700272,
120
+ "rewards/r_correctness/mean": 1.0,
121
+ "rewards/r_correctness/std": 1.0690449476242065,
122
+ "rewards/r_shaping/mean": -0.03097708337008953,
123
+ "rewards/r_shaping/std": 0.004845558665692806,
124
+ "rewards/r_soft/mean": 0.26250001788139343,
125
+ "rewards/r_soft/std": 0.1060660257935524,
126
+ "rewards/r_strict/mean": 0.0,
127
+ "rewards/r_strict/std": 0.0,
128
+ "rewards/r_xmlcount/mean": 0.078125,
129
+ "rewards/r_xmlcount/std": 0.06187184527516365,
130
+ "step": 4
131
+ },
132
+ {
133
+ "completion_length": 510.875,
134
+ "completions/clipped_ratio": 0.5,
135
+ "completions/max_length": 700.0,
136
+ "completions/max_terminated_length": 415.0,
137
+ "completions/mean_length": 510.875,
138
+ "completions/mean_terminated_length": 321.75,
139
+ "completions/min_length": 272.0,
140
+ "completions/min_terminated_length": 272.0,
141
+ "epoch": 0.0026766595289079227,
142
+ "frac_reward_zero_std": 0.0,
143
+ "grad_norm": NaN,
144
+ "kl": 0.0,
145
+ "learning_rate": 0.0004982230184254933,
146
+ "loss": 0.0,
147
+ "num_tokens": 23780.0,
148
+ "reward": 1.125322937965393,
149
+ "reward_std": 0.004006261937320232,
150
+ "rewards/r_correctness/mean": 1.0,
151
+ "rewards/r_correctness/std": 1.0690449476242065,
152
+ "rewards/r_shaping/mean": -0.024677084758877754,
153
+ "rewards/r_shaping/std": 0.011852074414491653,
154
+ "rewards/r_soft/mean": 0.15000000596046448,
155
+ "rewards/r_soft/std": 0.16035674512386322,
156
+ "rewards/r_strict/mean": 0.0,
157
+ "rewards/r_strict/std": 0.0,
158
+ "rewards/r_xmlcount/mean": 0.0,
159
+ "rewards/r_xmlcount/std": 0.10690450668334961,
160
+ "step": 5
161
+ },
162
+ {
163
+ "completion_length": 578.0,
164
+ "completions/clipped_ratio": 0.5,
165
+ "completions/max_length": 700.0,
166
+ "completions/max_terminated_length": 493.0,
167
+ "completions/mean_length": 578.0,
168
+ "completions/mean_terminated_length": 456.0,
169
+ "completions/min_length": 432.0,
170
+ "completions/min_terminated_length": 432.0,
171
+ "epoch": 0.0032119914346895075,
172
+ "frac_reward_zero_std": 0.5,
173
+ "grad_norm": 0.11385109275579453,
174
+ "kl": 0.0,
175
+ "learning_rate": 0.0004968438354840834,
176
+ "loss": 0.0,
177
+ "num_tokens": 29236.0,
178
+ "reward": 1.1172062158584595,
179
+ "reward_std": 0.001010744832456112,
180
+ "rewards/r_correctness/mean": 1.0,
181
+ "rewards/r_correctness/std": 1.0690449476242065,
182
+ "rewards/r_shaping/mean": -0.03279374912381172,
183
+ "rewards/r_shaping/std": 0.007816643454134464,
184
+ "rewards/r_soft/mean": 0.15000000596046448,
185
+ "rewards/r_soft/std": 0.16035674512386322,
186
+ "rewards/r_strict/mean": 0.0,
187
+ "rewards/r_strict/std": 0.0,
188
+ "rewards/r_xmlcount/mean": 0.0,
189
+ "rewards/r_xmlcount/std": 0.10690450668334961,
190
+ "step": 6
191
+ },
192
+ {
193
+ "completion_length": 565.125,
194
+ "completions/clipped_ratio": 0.5,
195
+ "completions/max_length": 700.0,
196
+ "completions/max_terminated_length": 448.0,
197
+ "completions/mean_length": 565.125,
198
+ "completions/mean_terminated_length": 430.25,
199
+ "completions/min_length": 417.0,
200
+ "completions/min_terminated_length": 417.0,
201
+ "epoch": 0.003747323340471092,
202
+ "frac_reward_zero_std": 0.5,
203
+ "grad_norm": NaN,
204
+ "kl": 0.002949223853647709,
205
+ "learning_rate": 0.0004950743417011591,
206
+ "loss": 0.0001,
207
+ "num_tokens": 34769.0,
208
+ "reward": 1.1194353103637695,
209
+ "reward_std": 0.00033387652365490794,
210
+ "rewards/r_correctness/mean": 1.0,
211
+ "rewards/r_correctness/std": 1.0690449476242065,
212
+ "rewards/r_shaping/mean": -0.030564583837985992,
213
+ "rewards/r_shaping/std": 0.010096355341374874,
214
+ "rewards/r_soft/mean": 0.15000000596046448,
215
+ "rewards/r_soft/std": 0.16035674512386322,
216
+ "rewards/r_strict/mean": 0.0,
217
+ "rewards/r_strict/std": 0.0,
218
+ "rewards/r_xmlcount/mean": 0.0,
219
+ "rewards/r_xmlcount/std": 0.10690450668334961,
220
+ "step": 7
221
+ },
222
+ {
223
+ "completion_length": 612.0,
224
+ "completions/clipped_ratio": 0.5,
225
+ "completions/max_length": 700.0,
226
+ "completions/max_terminated_length": 577.0,
227
+ "completions/mean_length": 612.0,
228
+ "completions/mean_terminated_length": 524.0,
229
+ "completions/min_length": 427.0,
230
+ "completions/min_terminated_length": 427.0,
231
+ "epoch": 0.004282655246252677,
232
+ "frac_reward_zero_std": 0.0,
233
+ "grad_norm": NaN,
234
+ "kl": 0.005263926927000284,
235
+ "learning_rate": 0.0004929173350101025,
236
+ "loss": 0.0003,
237
+ "num_tokens": 40597.0,
238
+ "reward": 1.1142561435699463,
239
+ "reward_std": 0.007277170196175575,
240
+ "rewards/r_correctness/mean": 1.0,
241
+ "rewards/r_correctness/std": 1.0690449476242065,
242
+ "rewards/r_shaping/mean": -0.032618746161460876,
243
+ "rewards/r_shaping/std": 0.00728320237249136,
244
+ "rewards/r_soft/mean": 0.15000000596046448,
245
+ "rewards/r_soft/std": 0.16035674512386322,
246
+ "rewards/r_strict/mean": 0.0,
247
+ "rewards/r_strict/std": 0.0,
248
+ "rewards/r_xmlcount/mean": -0.0031250007450580597,
249
+ "rewards/r_xmlcount/std": 0.11054855585098267,
250
+ "step": 8
251
+ },
252
+ {
253
+ "completion_length": 540.75,
254
+ "completions/clipped_ratio": 0.125,
255
+ "completions/max_length": 700.0,
256
+ "completions/max_terminated_length": 666.0,
257
+ "completions/mean_length": 540.75,
258
+ "completions/mean_terminated_length": 518.0,
259
+ "completions/min_length": 298.0,
260
+ "completions/min_terminated_length": 298.0,
261
+ "epoch": 0.004817987152034261,
262
+ "frac_reward_zero_std": 0.0,
263
+ "grad_norm": NaN,
264
+ "kl": 0.003534165909513831,
265
+ "learning_rate": 0.0004903762260818551,
266
+ "loss": 0.0002,
267
+ "num_tokens": 45663.0,
268
+ "reward": 2.0576353073120117,
269
+ "reward_std": 0.6352876424789429,
270
+ "rewards/r_correctness/mean": 1.75,
271
+ "rewards/r_correctness/std": 0.7071067690849304,
272
+ "rewards/r_shaping/mean": -0.026739582419395447,
273
+ "rewards/r_shaping/std": 0.007363913580775261,
274
+ "rewards/r_soft/mean": 0.26250001788139343,
275
+ "rewards/r_soft/std": 0.1060660257935524,
276
+ "rewards/r_strict/mean": 0.0,
277
+ "rewards/r_strict/std": 0.0,
278
+ "rewards/r_xmlcount/mean": 0.07187500596046448,
279
+ "rewards/r_xmlcount/std": 0.07954951375722885,
280
+ "step": 9
281
+ },
282
+ {
283
+ "completion_length": 471.375,
284
+ "completions/clipped_ratio": 0.0,
285
+ "completions/max_length": 593.0,
286
+ "completions/max_terminated_length": 593.0,
287
+ "completions/mean_length": 471.375,
288
+ "completions/mean_terminated_length": 471.375,
289
+ "completions/min_length": 353.0,
290
+ "completions/min_terminated_length": 353.0,
291
+ "epoch": 0.0053533190578158455,
292
+ "frac_reward_zero_std": 0.0,
293
+ "grad_norm": NaN,
294
+ "kl": 0.007448031101375818,
295
+ "learning_rate": 0.0004874550329319457,
296
+ "loss": 0.0004,
297
+ "num_tokens": 50302.0,
298
+ "reward": 2.3762893676757812,
299
+ "reward_std": 0.002844305010512471,
300
+ "rewards/r_correctness/mean": 2.0,
301
+ "rewards/r_correctness/std": 0.0,
302
+ "rewards/r_shaping/mean": -0.02371041476726532,
303
+ "rewards/r_shaping/std": 0.0028945477679371834,
304
+ "rewards/r_soft/mean": 0.30000001192092896,
305
+ "rewards/r_soft/std": 0.0,
306
+ "rewards/r_strict/mean": 0.0,
307
+ "rewards/r_strict/std": 0.0,
308
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
309
+ "rewards/r_xmlcount/std": 0.0,
310
+ "step": 10
311
  },
312
  {
313
+ "completion_length": 486.875,
314
+ "completions/clipped_ratio": 0.0,
315
+ "completions/max_length": 586.0,
316
+ "completions/max_terminated_length": 586.0,
317
+ "completions/mean_length": 486.875,
318
+ "completions/mean_terminated_length": 486.875,
319
+ "completions/min_length": 404.0,
320
+ "completions/min_terminated_length": 404.0,
321
+ "epoch": 0.005888650963597431,
322
+ "frac_reward_zero_std": 0.0,
323
  "grad_norm": NaN,
324
+ "kl": 0.0019934047013521194,
325
+ "learning_rate": 0.00048415837456718195,
326
+ "loss": 0.0001,
327
+ "num_tokens": 55077.0,
328
+ "reward": 2.3770666122436523,
329
+ "reward_std": 0.0032932735048234463,
330
+ "rewards/r_correctness/mean": 2.0,
331
+ "rewards/r_correctness/std": 0.0,
332
+ "rewards/r_shaping/mean": -0.02293333411216736,
333
+ "rewards/r_shaping/std": 0.0030542060267180204,
334
+ "rewards/r_soft/mean": 0.30000001192092896,
335
+ "rewards/r_soft/std": 0.0,
336
+ "rewards/r_strict/mean": 0.0,
337
+ "rewards/r_strict/std": 0.0,
338
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
339
+ "rewards/r_xmlcount/std": 0.0,
340
+ "step": 11
341
  },
342
  {
343
+ "completion_length": 520.625,
344
+ "completions/clipped_ratio": 0.125,
345
+ "completions/max_length": 700.0,
346
+ "completions/max_terminated_length": 604.0,
347
+ "completions/mean_length": 520.625,
348
+ "completions/mean_terminated_length": 495.0000305175781,
349
+ "completions/min_length": 386.0,
350
+ "completions/min_terminated_length": 386.0,
351
+ "epoch": 0.006423982869379015,
352
+ "frac_reward_zero_std": 0.0,
353
  "grad_norm": NaN,
354
+ "kl": 0.0034809389617294073,
355
+ "learning_rate": 0.0004804914636820517,
356
+ "loss": 0.0002,
357
+ "num_tokens": 60182.0,
358
+ "reward": 2.0551228523254395,
359
+ "reward_std": 0.6347294449806213,
360
+ "rewards/r_correctness/mean": 1.75,
361
+ "rewards/r_correctness/std": 0.7071067690849304,
362
+ "rewards/r_shaping/mean": -0.029252082109451294,
363
+ "rewards/r_shaping/std": 0.005904427729547024,
364
+ "rewards/r_soft/mean": 0.26250001788139343,
365
+ "rewards/r_soft/std": 0.1060660257935524,
366
+ "rewards/r_strict/mean": 0.0,
367
+ "rewards/r_strict/std": 0.0,
368
+ "rewards/r_xmlcount/mean": 0.07187500596046448,
369
+ "rewards/r_xmlcount/std": 0.07954951375722885,
370
+ "step": 12
371
  },
372
  {
373
+ "completion_length": 601.25,
374
+ "completions/clipped_ratio": 0.375,
375
+ "completions/max_length": 700.0,
376
+ "completions/max_terminated_length": 698.0,
377
+ "completions/mean_length": 601.25,
378
+ "completions/mean_terminated_length": 542.0,
379
+ "completions/min_length": 400.0,
380
+ "completions/min_terminated_length": 400.0,
381
+ "epoch": 0.006959314775160599,
382
+ "frac_reward_zero_std": 0.0,
383
+ "grad_norm": NaN,
384
+ "kl": 0.0022265929728746414,
385
+ "learning_rate": 0.00047646009841638084,
386
+ "loss": 0.0001,
387
+ "num_tokens": 66032.0,
388
+ "reward": 1.6835541725158691,
389
+ "reward_std": 0.6659948229789734,
390
+ "rewards/r_correctness/mean": 1.5,
391
+ "rewards/r_correctness/std": 0.9258201122283936,
392
+ "rewards/r_shaping/mean": -0.0320708304643631,
393
+ "rewards/r_shaping/std": 0.006900239735841751,
394
+ "rewards/r_soft/mean": 0.1875,
395
+ "rewards/r_soft/std": 0.15526476502418518,
396
+ "rewards/r_strict/mean": 0.0,
397
+ "rewards/r_strict/std": 0.0,
398
+ "rewards/r_xmlcount/mean": 0.02812499925494194,
399
+ "rewards/r_xmlcount/std": 0.0994965061545372,
400
+ "step": 13
401
+ },
402
+ {
403
+ "completion_length": 478.375,
404
+ "completions/clipped_ratio": 0.0,
405
+ "completions/max_length": 600.0,
406
+ "completions/max_terminated_length": 600.0,
407
+ "completions/mean_length": 478.375,
408
+ "completions/mean_terminated_length": 478.375,
409
+ "completions/min_length": 269.0,
410
+ "completions/min_terminated_length": 269.0,
411
+ "epoch": 0.007494646680942184,
412
+ "frac_reward_zero_std": 0.0,
413
+ "grad_norm": NaN,
414
+ "kl": 0.002636353485286236,
415
+ "learning_rate": 0.00047207065318728296,
416
+ "loss": 0.0001,
417
+ "num_tokens": 70759.0,
418
+ "reward": 2.3731250762939453,
419
+ "reward_std": 0.0061251213774085045,
420
+ "rewards/r_correctness/mean": 2.0,
421
+ "rewards/r_correctness/std": 0.0,
422
+ "rewards/r_shaping/mean": -0.026875000447034836,
423
+ "rewards/r_shaping/std": 0.006678614765405655,
424
+ "rewards/r_soft/mean": 0.30000001192092896,
425
+ "rewards/r_soft/std": 0.0,
426
+ "rewards/r_strict/mean": 0.0,
427
+ "rewards/r_strict/std": 0.0,
428
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
429
+ "rewards/r_xmlcount/std": 0.0,
430
+ "step": 14
431
+ },
432
+ {
433
+ "completion_length": 367.125,
434
+ "completions/clipped_ratio": 0.0,
435
+ "completions/max_length": 524.0,
436
+ "completions/max_terminated_length": 524.0,
437
+ "completions/mean_length": 367.125,
438
+ "completions/mean_terminated_length": 367.125,
439
+ "completions/min_length": 214.0,
440
+ "completions/min_terminated_length": 214.0,
441
+ "epoch": 0.008029978586723769,
442
+ "frac_reward_zero_std": 0.0,
443
+ "grad_norm": NaN,
444
+ "kl": 0.005754380952566862,
445
+ "learning_rate": 0.00046733006860989566,
446
+ "loss": 0.0003,
447
+ "num_tokens": 74608.0,
448
+ "reward": 2.3812456130981445,
449
+ "reward_std": 0.0021071869414299726,
450
+ "rewards/r_correctness/mean": 2.0,
451
+ "rewards/r_correctness/std": 0.0,
452
+ "rewards/r_shaping/mean": -0.01875416561961174,
453
+ "rewards/r_shaping/std": 0.005434602499008179,
454
+ "rewards/r_soft/mean": 0.30000001192092896,
455
+ "rewards/r_soft/std": 0.0,
456
+ "rewards/r_strict/mean": 0.0,
457
+ "rewards/r_strict/std": 0.0,
458
+ "rewards/r_xmlcount/mean": 0.10000000149011612,
459
+ "rewards/r_xmlcount/std": 0.0,
460
+ "step": 15
461
+ },
462
+ {
463
+ "completion_length": 515.875,
464
+ "completions/clipped_ratio": 0.5,
465
+ "completions/max_length": 700.0,
466
+ "completions/max_terminated_length": 366.0,
467
+ "completions/mean_length": 515.875,
468
+ "completions/mean_terminated_length": 331.75,
469
+ "completions/min_length": 296.0,
470
+ "completions/min_terminated_length": 296.0,
471
+ "epoch": 0.008565310492505354,
472
+ "frac_reward_zero_std": 0.5,
473
+ "grad_norm": NaN,
474
+ "kl": 0.0027227874379605055,
475
+ "learning_rate": 0.000462245840522841,
476
+ "loss": 0.0001,
477
+ "num_tokens": 79783.0,
478
+ "reward": 1.1220020055770874,
479
+ "reward_std": 0.0010650103213265538,
480
+ "rewards/r_correctness/mean": 1.0,
481
+ "rewards/r_correctness/std": 1.0690449476242065,
482
+ "rewards/r_shaping/mean": -0.02799791656434536,
483
+ "rewards/r_shaping/std": 0.012906314805150032,
484
+ "rewards/r_soft/mean": 0.15000000596046448,
485
+ "rewards/r_soft/std": 0.16035674512386322,
486
+ "rewards/r_strict/mean": 0.0,
487
+ "rewards/r_strict/std": 0.0,
488
+ "rewards/r_xmlcount/mean": 0.0,
489
+ "rewards/r_xmlcount/std": 0.10690450668334961,
490
+ "step": 16
491
  }
492
  ],
493
+ "logging_steps": 1,
494
+ "max_steps": 80,
495
+ "num_input_tokens_seen": 79783,
496
+ "num_train_epochs": 1,
497
+ "save_steps": 50,
498
  "stateful_callbacks": {
499
  "TrainerControl": {
500
  "args": {
 
508
  }
509
  },
510
  "total_flos": 0.0,
511
+ "train_batch_size": 8,
512
  "trial_name": null,
513
  "trial_params": null
514
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3c632b7ef7f4c3a96d793238debf3b3c148eefeafd13a0eb352fc99bdfc3a53
3
  size 7057
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8aae4ae9cab15fd072a558cc4adfaae1d03da7de7967e80a548c0da40926b282
3
  size 7057