Luuyin committed on
Commit
9bb0b8d
·
verified ·
1 Parent(s): ceaeaec

Model save

Browse files
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- base_model: Qwen/Qwen2.5-Math-7B
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
@@ -11,7 +11,7 @@ licence: license
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
- This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
@@ -27,7 +27,7 @@ print(output["generated_text"])
27
 
28
  ## Training procedure
29
 
30
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yinluu-cn/huggingface/runs/03s5hgsp)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
 
1
  ---
2
+ base_model: Qwen/Qwen2.5-Math-1.5B-Instruct
3
  library_name: transformers
4
  model_name: Qwen-2.5-7B-Simple-RL
5
  tags:
 
11
 
12
  # Model Card for Qwen-2.5-7B-Simple-RL
13
 
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Math-1.5B-Instruct).
15
  It has been trained using [TRL](https://github.com/huggingface/trl).
16
 
17
  ## Quick start
 
27
 
28
  ## Training procedure
29
 
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/yinluu-cn/huggingface/runs/h7vr8p5k)
31
 
32
 
33
  This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06954928480375272,
4
- "train_runtime": 34706.12,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.216,
7
- "train_steps_per_second": 0.009
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.018009291775785804,
4
+ "train_runtime": 13979.803,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.536,
7
+ "train_steps_per_second": 0.022
8
  }
config.json CHANGED
@@ -1,30 +1,29 @@
1
  {
2
- "_name_or_path": "Qwen/Qwen2.5-Math-7B",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
- "eos_token_id": 151643,
9
  "hidden_act": "silu",
10
- "hidden_size": 3584,
11
  "initializer_range": 0.02,
12
- "intermediate_size": 18944,
13
  "max_position_embeddings": 4096,
14
- "max_window_layers": 28,
15
  "model_type": "qwen2",
16
- "num_attention_heads": 28,
17
  "num_hidden_layers": 28,
18
- "num_key_value_heads": 4,
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": null,
21
- "rope_theta": 10000,
22
  "sliding_window": 4096,
23
- "tie_word_embeddings": false,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
  "use_cache": false,
27
- "use_mrope": false,
28
  "use_sliding_window": false,
29
- "vocab_size": 152064
30
  }
 
1
  {
2
+ "_name_or_path": "Qwen/Qwen2.5-Math-1.5B-Instruct",
3
  "architectures": [
4
  "Qwen2ForCausalLM"
5
  ],
6
  "attention_dropout": 0.0,
7
  "bos_token_id": 151643,
8
+ "eos_token_id": 151645,
9
  "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
  "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
  "max_position_embeddings": 4096,
14
+ "max_window_layers": 21,
15
  "model_type": "qwen2",
16
+ "num_attention_heads": 12,
17
  "num_hidden_layers": 28,
18
+ "num_key_value_heads": 2,
19
  "rms_norm_eps": 1e-06,
20
  "rope_scaling": null,
21
+ "rope_theta": 10000.0,
22
  "sliding_window": 4096,
23
+ "tie_word_embeddings": true,
24
  "torch_dtype": "bfloat16",
25
  "transformers_version": "4.49.0",
26
  "use_cache": false,
 
27
  "use_sliding_window": false,
28
+ "vocab_size": 151936
29
  }
generation_config.json CHANGED
@@ -1,6 +1,9 @@
1
  {
2
  "bos_token_id": 151643,
3
- "eos_token_id": 151643,
4
- "max_new_tokens": 2048,
 
 
 
5
  "transformers_version": "4.49.0"
6
  }
 
1
  {
2
  "bos_token_id": 151643,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
  "transformers_version": "4.49.0"
9
  }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb7359282e727ae3eb613b036eef7f64a28cab27ec48f83ae15a48c9aad06468
3
+ size 3087467144
special_tokens_map.json CHANGED
@@ -15,7 +15,7 @@
15
  "<|video_pad|>"
16
  ],
17
  "eos_token": {
18
- "content": "<|endoftext|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
 
15
  "<|video_pad|>"
16
  ],
17
  "eos_token": {
18
+ "content": "<|im_end|>",
19
  "lstrip": false,
20
  "normalized": false,
21
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -197,7 +197,7 @@
197
  "bos_token": null,
198
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- 
'<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
  "clean_up_tokenization_spaces": false,
200
- "eos_token": "<|endoftext|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
  "model_max_length": 131072,
 
197
  "bos_token": null,
198
  "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'Please reason step by step, and put your final answer within \\\\boxed{}.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nPlease reason step by step, and put your final answer within \\\\boxed{}.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- 
'<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
  "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
  "errors": "replace",
202
  "extra_special_tokens": {},
203
  "model_max_length": 131072,
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "total_flos": 0.0,
3
- "train_loss": 0.06954928480375272,
4
- "train_runtime": 34706.12,
5
  "train_samples": 7500,
6
- "train_samples_per_second": 0.216,
7
- "train_steps_per_second": 0.009
8
  }
 
1
  {
2
  "total_flos": 0.0,
3
+ "train_loss": 0.018009291775785804,
4
+ "train_runtime": 13979.803,
5
  "train_samples": 7500,
6
+ "train_samples_per_second": 0.536,
7
+ "train_steps_per_second": 0.022
8
  }
trainer_state.json CHANGED
@@ -10,925 +10,925 @@
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
- "completion_length": 569.2916870117188,
14
  "epoch": 0.0032,
15
- "grad_norm": 0.6810210347175598,
16
  "kl": 0.0,
17
  "learning_rate": 9.375e-08,
18
- "loss": -0.0142,
19
- "reward": 0.6250000149011612,
20
- "reward_std": 0.28694797679781914,
21
- "rewards/accuracy_reward": 0.6250000149011612,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
- "completion_length": 589.1067867279053,
28
  "epoch": 0.016,
29
- "grad_norm": 0.5563701391220093,
30
- "kl": 0.00018343329429626465,
31
  "learning_rate": 4.6875e-07,
32
- "loss": -0.0273,
33
- "reward": 0.6406250172294676,
34
- "reward_std": 0.3286146428436041,
35
- "rewards/accuracy_reward": 0.6406250172294676,
36
  "rewards/format_reward": 0.0,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
- "completion_length": 589.9229347229004,
42
  "epoch": 0.032,
43
- "grad_norm": 0.2431538701057434,
44
- "kl": 0.00020442008972167968,
45
  "learning_rate": 9.375e-07,
46
- "loss": -0.0048,
47
- "reward": 0.6479166872799397,
48
- "reward_std": 0.2744479771703482,
49
- "rewards/accuracy_reward": 0.6479166872799397,
50
  "rewards/format_reward": 0.0,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
- "completion_length": 609.5458503723145,
56
  "epoch": 0.048,
57
- "grad_norm": 0.9991121888160706,
58
- "kl": 0.0006866693496704101,
59
  "learning_rate": 1.40625e-06,
60
- "loss": 0.0165,
61
- "reward": 0.6395833514630794,
62
- "reward_std": 0.31963672377169133,
63
- "rewards/accuracy_reward": 0.6395833514630794,
64
  "rewards/format_reward": 0.0,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
- "completion_length": 639.2354362487793,
70
  "epoch": 0.064,
71
- "grad_norm": 1.0517014265060425,
72
- "kl": 0.6095457077026367,
73
  "learning_rate": 1.875e-06,
74
- "loss": -0.0132,
75
- "reward": 0.6333333514630795,
76
- "reward_std": 0.311947975680232,
77
- "rewards/accuracy_reward": 0.6333333514630795,
78
  "rewards/format_reward": 0.0,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
- "completion_length": 604.0208541870118,
84
  "epoch": 0.08,
85
- "grad_norm": 0.4394618272781372,
86
- "kl": 0.006474208831787109,
87
  "learning_rate": 2.3437500000000002e-06,
88
- "loss": 0.0381,
89
- "reward": 0.6479166842997074,
90
- "reward_std": 0.22504628337919713,
91
- "rewards/accuracy_reward": 0.6479166842997074,
92
  "rewards/format_reward": 0.0,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
- "completion_length": 595.7833572387696,
98
  "epoch": 0.096,
99
- "grad_norm": 7.958812236785889,
100
- "kl": 0.017683982849121094,
101
  "learning_rate": 2.8125e-06,
102
- "loss": 0.0652,
103
- "reward": 0.7229166865348816,
104
- "reward_std": 0.2058012742549181,
105
- "rewards/accuracy_reward": 0.7229166865348816,
106
  "rewards/format_reward": 0.0,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
- "completion_length": 620.6187690734863,
112
  "epoch": 0.112,
113
- "grad_norm": 0.2736396789550781,
114
- "kl": 0.006215381622314453,
115
  "learning_rate": 2.9991503375003e-06,
116
- "loss": 0.0515,
117
- "reward": 0.7458333507180214,
118
- "reward_std": 0.18754628226161002,
119
- "rewards/accuracy_reward": 0.7458333507180214,
120
  "rewards/format_reward": 0.0,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
- "completion_length": 613.6250183105469,
126
  "epoch": 0.128,
127
- "grad_norm": 0.28497475385665894,
128
- "kl": 0.9405881881713867,
129
  "learning_rate": 2.993961440992859e-06,
130
- "loss": 0.0365,
131
- "reward": 0.7520833522081375,
132
- "reward_std": 0.19394586011767387,
133
- "rewards/accuracy_reward": 0.7520833522081375,
134
  "rewards/format_reward": 0.0,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
- "completion_length": 599.8812660217285,
140
  "epoch": 0.144,
141
- "grad_norm": 0.5031656622886658,
142
- "kl": 0.005823993682861328,
143
  "learning_rate": 2.984071989079555e-06,
144
- "loss": 0.0255,
145
- "reward": 0.7500000139698386,
146
- "reward_std": 0.16220085099339485,
147
- "rewards/accuracy_reward": 0.7500000139698386,
148
  "rewards/format_reward": 0.0,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
- "completion_length": 597.2396011352539,
154
  "epoch": 0.16,
155
- "grad_norm": 0.21613067388534546,
156
- "kl": 0.0038125991821289064,
157
  "learning_rate": 2.9695130976348534e-06,
158
- "loss": 0.0487,
159
- "reward": 0.7708333551883697,
160
- "reward_std": 0.17311252616345882,
161
- "rewards/accuracy_reward": 0.7708333551883697,
162
  "rewards/format_reward": 0.0,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
- "completion_length": 601.7229354858398,
168
  "epoch": 0.176,
169
- "grad_norm": 1.0250929594039917,
170
- "kl": 0.009112930297851563,
171
  "learning_rate": 2.9503305743175096e-06,
172
- "loss": 0.0547,
173
- "reward": 0.7187500186264515,
174
- "reward_std": 0.20966878794133664,
175
- "rewards/accuracy_reward": 0.7187500186264515,
176
  "rewards/format_reward": 0.0,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
- "completion_length": 596.8396026611329,
182
  "epoch": 0.192,
183
- "grad_norm": 0.30940911173820496,
184
- "kl": 0.0079010009765625,
185
  "learning_rate": 2.9265847744427307e-06,
186
- "loss": 0.0128,
187
- "reward": 0.7854166835546493,
188
- "reward_std": 0.17117876932024956,
189
- "rewards/accuracy_reward": 0.7854166835546493,
190
  "rewards/format_reward": 0.0,
191
  "step": 60
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
- "completion_length": 538.6895942687988,
196
  "epoch": 0.208,
197
- "grad_norm": 0.17191636562347412,
198
- "kl": 0.0058624267578125,
199
  "learning_rate": 2.8983504110820214e-06,
200
- "loss": 0.0163,
201
- "reward": 0.8041666835546494,
202
- "reward_std": 0.15163460709154605,
203
- "rewards/accuracy_reward": 0.8041666835546494,
204
  "rewards/format_reward": 0.0,
205
  "step": 65
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
- "completion_length": 550.6395980834961,
210
  "epoch": 0.224,
211
- "grad_norm": 0.4740332365036011,
212
- "kl": 0.007071876525878906,
213
  "learning_rate": 2.865716319988224e-06,
214
- "loss": 0.035,
215
- "reward": 0.7979166805744171,
216
- "reward_std": 0.15227919220924377,
217
- "rewards/accuracy_reward": 0.7979166805744171,
218
  "rewards/format_reward": 0.0,
219
  "step": 70
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
- "completion_length": 578.7104339599609,
224
  "epoch": 0.24,
225
- "grad_norm": 0.4841112196445465,
226
- "kl": 0.005505561828613281,
227
  "learning_rate": 2.82878518008537e-06,
228
- "loss": 0.0154,
229
- "reward": 0.7520833503454923,
230
- "reward_std": 0.15932335443794726,
231
- "rewards/accuracy_reward": 0.7520833503454923,
232
  "rewards/format_reward": 0.0,
233
  "step": 75
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
- "completion_length": 570.0229293823243,
238
  "epoch": 0.256,
239
- "grad_norm": 2.4007749557495117,
240
- "kl": 2.4198307037353515,
241
  "learning_rate": 2.7876731904027993e-06,
242
- "loss": 0.0991,
243
- "reward": 0.7916666835546493,
244
- "reward_std": 0.14682335481047631,
245
- "rewards/accuracy_reward": 0.7916666835546493,
246
  "rewards/format_reward": 0.0,
247
  "step": 80
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
- "completion_length": 539.7104286193847,
252
  "epoch": 0.272,
253
- "grad_norm": 0.5474560260772705,
254
- "kl": 0.008501434326171875,
255
  "learning_rate": 2.7425097044700246e-06,
256
- "loss": 0.0205,
257
- "reward": 0.8125000163912773,
258
- "reward_std": 0.16254628226161003,
259
- "rewards/accuracy_reward": 0.8125000163912773,
260
  "rewards/format_reward": 0.0,
261
  "step": 85
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
- "completion_length": 608.4396018981934,
266
  "epoch": 0.288,
267
- "grad_norm": 0.7235979437828064,
268
- "kl": 0.33249053955078123,
269
  "learning_rate": 2.6934368233226715e-06,
270
- "loss": 0.055,
271
- "reward": 0.7354166839271784,
272
- "reward_std": 0.17375711128115653,
273
- "rewards/accuracy_reward": 0.7354166839271784,
274
  "rewards/format_reward": 0.0,
275
  "step": 90
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
- "completion_length": 570.7458480834961,
280
  "epoch": 0.304,
281
- "grad_norm": 0.2708403766155243,
282
- "kl": 0.007986068725585938,
283
  "learning_rate": 2.6406089484000465e-06,
284
- "loss": 0.0253,
285
- "reward": 0.7791666805744171,
286
- "reward_std": 0.1433012720197439,
287
- "rewards/accuracy_reward": 0.7791666805744171,
288
  "rewards/format_reward": 0.0,
289
  "step": 95
290
  },
291
  {
292
  "epoch": 0.32,
293
- "grad_norm": 0.38377681374549866,
294
  "learning_rate": 2.584192295741087e-06,
295
- "loss": 0.0363,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 0.32,
300
  "eval_clip_ratio": 0.0,
301
- "eval_completion_length": 512.997021064073,
302
- "eval_kl": 0.035927641177605725,
303
- "eval_loss": 0.009229963645339012,
304
- "eval_reward": 0.8156187802017806,
305
- "eval_reward_std": 0.1942277729154347,
306
- "eval_rewards/accuracy_reward": 0.8156187802017806,
307
  "eval_rewards/format_reward": 0.0,
308
- "eval_runtime": 2950.3108,
309
- "eval_samples_per_second": 0.339,
310
- "eval_steps_per_second": 0.028,
311
  "step": 100
312
  },
313
  {
314
  "clip_ratio": 0.0,
315
- "completion_length": 570.2916835784912,
316
  "epoch": 0.336,
317
- "grad_norm": 0.5292491316795349,
318
- "kl": 0.00650491714477539,
319
  "learning_rate": 2.5243643730072105e-06,
320
- "loss": 0.0256,
321
- "reward": 0.7666666842997074,
322
- "reward_std": 0.17311252579092978,
323
- "rewards/accuracy_reward": 0.7666666842997074,
324
  "rewards/format_reward": 0.0,
325
  "step": 105
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
- "completion_length": 570.495849609375,
330
  "epoch": 0.352,
331
- "grad_norm": 0.30810341238975525,
332
- "kl": 0.01353607177734375,
333
  "learning_rate": 2.461313420977536e-06,
334
- "loss": 0.0062,
335
- "reward": 0.7770833469927311,
336
- "reward_std": 0.18913460709154606,
337
- "rewards/accuracy_reward": 0.7770833469927311,
338
  "rewards/format_reward": 0.0,
339
  "step": 110
340
  },
341
  {
342
  "clip_ratio": 0.0,
343
- "completion_length": 559.052098083496,
344
  "epoch": 0.368,
345
- "grad_norm": 0.49775150418281555,
346
- "kl": 0.007056427001953125,
347
  "learning_rate": 2.3952378212737554e-06,
348
- "loss": 0.0152,
349
- "reward": 0.810416679829359,
350
- "reward_std": 0.1641346074640751,
351
- "rewards/accuracy_reward": 0.810416679829359,
352
  "rewards/format_reward": 0.0,
353
  "step": 115
354
  },
355
  {
356
  "clip_ratio": 0.0,
357
- "completion_length": 573.0354347229004,
358
  "epoch": 0.384,
359
- "grad_norm": 0.4474624693393707,
360
- "kl": 0.013144111633300782,
361
  "learning_rate": 2.3263454721781537e-06,
362
- "loss": 0.0153,
363
- "reward": 0.8104166805744171,
364
- "reward_std": 0.16061252541840076,
365
- "rewards/accuracy_reward": 0.8104166805744171,
366
  "rewards/format_reward": 0.0,
367
  "step": 120
368
  },
369
  {
370
  "clip_ratio": 0.0,
371
- "completion_length": 584.2166816711426,
372
  "epoch": 0.4,
373
- "grad_norm": 0.44476044178009033,
374
- "kl": 0.007654571533203125,
375
  "learning_rate": 2.2548531345087003e-06,
376
- "loss": 0.0265,
377
- "reward": 0.733333346247673,
378
- "reward_std": 0.15773502960801125,
379
- "rewards/accuracy_reward": 0.733333346247673,
380
  "rewards/format_reward": 0.0,
381
  "step": 125
382
  },
383
  {
384
  "clip_ratio": 0.0,
385
- "completion_length": 544.5312660217285,
386
  "epoch": 0.416,
387
- "grad_norm": 0.7776355147361755,
388
- "kl": 0.0113861083984375,
389
  "learning_rate": 2.18098574960932e-06,
390
- "loss": 0.0367,
391
- "reward": 0.7979166820645333,
392
- "reward_std": 0.17856836393475534,
393
- "rewards/accuracy_reward": 0.7979166820645333,
394
  "rewards/format_reward": 0.0,
395
  "step": 130
396
  },
397
  {
398
  "clip_ratio": 0.0,
399
- "completion_length": 588.150015258789,
400
  "epoch": 0.432,
401
- "grad_norm": 0.46490195393562317,
402
- "kl": 0.009286117553710938,
403
  "learning_rate": 2.104975731601208e-06,
404
- "loss": 0.0511,
405
- "reward": 0.6750000175088644,
406
- "reward_std": 0.19940169602632524,
407
- "rewards/accuracy_reward": 0.6750000175088644,
408
  "rewards/format_reward": 0.0,
409
  "step": 135
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
- "completion_length": 585.2770973205567,
414
  "epoch": 0.448,
415
- "grad_norm": 0.32955658435821533,
416
- "kl": 0.01014404296875,
417
  "learning_rate": 2.027062236122014e-06,
418
- "loss": 0.0294,
419
- "reward": 0.7437500163912774,
420
- "reward_std": 0.17856836318969727,
421
- "rewards/accuracy_reward": 0.7437500163912774,
422
  "rewards/format_reward": 0.0,
423
  "step": 140
424
  },
425
  {
426
  "clip_ratio": 0.0,
427
- "completion_length": 587.1312675476074,
428
  "epoch": 0.464,
429
- "grad_norm": 0.2798615097999573,
430
- "kl": 0.0207550048828125,
431
  "learning_rate": 1.9474904078537343e-06,
432
- "loss": 0.0287,
433
- "reward": 0.7145833492279052,
434
- "reward_std": 0.1734116803854704,
435
- "rewards/accuracy_reward": 0.7145833492279052,
436
  "rewards/format_reward": 0.0,
437
  "step": 145
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
- "completion_length": 564.8291877746582,
442
  "epoch": 0.48,
443
- "grad_norm": 0.26126375794410706,
444
- "kl": 0.016759872436523438,
445
  "learning_rate": 1.866510609206841e-06,
446
- "loss": 0.0437,
447
- "reward": 0.7750000171363354,
448
- "reward_std": 0.17440169677138329,
449
- "rewards/accuracy_reward": 0.7750000171363354,
450
  "rewards/format_reward": 0.0,
451
  "step": 150
452
  },
453
  {
454
  "clip_ratio": 0.0,
455
- "completion_length": 617.829182434082,
456
  "epoch": 0.496,
457
- "grad_norm": 0.8329501748085022,
458
- "kl": 0.03573684692382813,
459
  "learning_rate": 1.784377632587518e-06,
460
- "loss": 0.0607,
461
- "reward": 0.7145833536982537,
462
- "reward_std": 0.21542377918958663,
463
- "rewards/accuracy_reward": 0.7145833536982537,
464
  "rewards/format_reward": 0.0,
465
  "step": 155
466
  },
467
  {
468
  "clip_ratio": 0.0,
469
- "completion_length": 652.3729393005372,
470
  "epoch": 0.512,
471
- "grad_norm": 3.987293004989624,
472
- "kl": 0.11999282836914063,
473
  "learning_rate": 1.7013498987264833e-06,
474
- "loss": 0.0785,
475
- "reward": 0.6625000197440386,
476
- "reward_std": 0.21477919332683088,
477
- "rewards/accuracy_reward": 0.6625000197440386,
478
  "rewards/format_reward": 0.0,
479
  "step": 160
480
  },
481
  {
482
  "clip_ratio": 0.0,
483
- "completion_length": 624.3125183105469,
484
  "epoch": 0.528,
485
- "grad_norm": 8.62370491027832,
486
- "kl": 0.38817138671875,
487
  "learning_rate": 1.6176886435917677e-06,
488
- "loss": 0.1121,
489
- "reward": 0.6416666841134429,
490
- "reward_std": 0.2699358768761158,
491
- "rewards/accuracy_reward": 0.6416666841134429,
492
  "rewards/format_reward": 0.0,
493
  "step": 165
494
  },
495
  {
496
  "clip_ratio": 0.0,
497
- "completion_length": 591.1229301452637,
498
  "epoch": 0.544,
499
- "grad_norm": 15.794644355773926,
500
- "kl": 0.3599708557128906,
501
  "learning_rate": 1.5336570964437077e-06,
502
- "loss": 0.0683,
503
- "reward": 0.6604166872799396,
504
- "reward_std": 0.25133545473217966,
505
- "rewards/accuracy_reward": 0.6604166872799396,
506
  "rewards/format_reward": 0.0,
507
  "step": 170
508
  },
509
  {
510
  "clip_ratio": 0.0,
511
- "completion_length": 611.1896026611328,
512
  "epoch": 0.56,
513
- "grad_norm": 41.3077507019043,
514
- "kl": 0.33219223022460936,
515
  "learning_rate": 1.4495196516183096e-06,
516
- "loss": 0.0784,
517
- "reward": 0.5979166880249978,
518
- "reward_std": 0.28243587724864483,
519
- "rewards/accuracy_reward": 0.5979166880249978,
520
  "rewards/format_reward": 0.0,
521
  "step": 175
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
- "completion_length": 560.6729354858398,
526
  "epoch": 0.576,
527
- "grad_norm": 4836.41943359375,
528
- "kl": 5.374520874023437,
529
  "learning_rate": 1.3655410366448499e-06,
530
- "loss": 0.2374,
531
- "reward": 0.6708333529531956,
532
- "reward_std": 0.2580804623663425,
533
- "rewards/accuracy_reward": 0.6708333529531956,
534
  "rewards/format_reward": 0.0,
535
  "step": 180
536
  },
537
  {
538
  "clip_ratio": 0.0,
539
- "completion_length": 637.4396018981934,
540
  "epoch": 0.592,
541
- "grad_norm": 170.84182739257812,
542
- "kl": 0.36361236572265626,
543
  "learning_rate": 1.2819854793151313e-06,
544
- "loss": 0.06,
545
- "reward": 0.6416666839271784,
546
- "reward_std": 0.2599679421633482,
547
- "rewards/accuracy_reward": 0.6416666839271784,
548
  "rewards/format_reward": 0.0,
549
  "step": 185
550
  },
551
  {
552
  "clip_ratio": 0.0,
553
- "completion_length": 566.9208518981934,
554
  "epoch": 0.608,
555
- "grad_norm": 93.1268539428711,
556
- "kl": 0.42458267211914064,
557
  "learning_rate": 1.199115876325091e-06,
558
- "loss": 0.0582,
559
- "reward": 0.6875000169500709,
560
- "reward_std": 0.2580804631114006,
561
- "rewards/accuracy_reward": 0.6875000169500709,
562
  "rewards/format_reward": 0.0,
563
  "step": 190
564
  },
565
  {
566
  "clip_ratio": 0.0,
567
- "completion_length": 582.2895980834961,
568
  "epoch": 0.624,
569
- "grad_norm": 84.15328979492188,
570
- "kl": 0.388812255859375,
571
  "learning_rate": 1.1171929661045361e-06,
572
- "loss": 0.1015,
573
- "reward": 0.6979166865348816,
574
- "reward_std": 0.2670583825558424,
575
- "rewards/accuracy_reward": 0.6979166865348816,
576
  "rewards/format_reward": 0.0,
577
  "step": 195
578
  },
579
  {
580
  "epoch": 0.64,
581
- "grad_norm": 38.52534866333008,
582
  "learning_rate": 1.036474508437579e-06,
583
- "loss": 0.0775,
584
  "step": 200
585
  },
586
  {
587
  "epoch": 0.64,
588
  "eval_clip_ratio": 0.0,
589
- "eval_completion_length": 522.7839470138093,
590
- "eval_kl": 0.5824062324569611,
591
- "eval_loss": 0.07062587141990662,
592
- "eval_reward": 0.778942135279764,
593
- "eval_reward_std": 0.23596480711848436,
594
- "eval_rewards/accuracy_reward": 0.778942135279764,
595
  "eval_rewards/format_reward": 0.0,
596
- "eval_runtime": 3050.6148,
597
- "eval_samples_per_second": 0.328,
598
- "eval_steps_per_second": 0.028,
599
  "step": 200
600
  },
601
  {
602
  "clip_ratio": 0.0,
603
- "completion_length": 577.4750202178955,
604
  "epoch": 0.656,
605
- "grad_norm": 260.2613220214844,
606
- "kl": 0.7868415832519531,
607
  "learning_rate": 9.57214473454992e-07,
608
- "loss": 0.1073,
609
- "reward": 0.7072916869074106,
610
- "reward_std": 0.2551798287779093,
611
- "rewards/accuracy_reward": 0.7072916869074106,
612
  "rewards/format_reward": 0.0,
613
  "step": 205
614
  },
615
  {
616
  "clip_ratio": 0.0,
617
- "completion_length": 591.0208511352539,
618
  "epoch": 0.672,
619
- "grad_norm": 78.86795043945312,
620
- "kl": 1.4788040161132812,
621
  "learning_rate": 8.796622425502193e-07,
622
- "loss": 0.1335,
623
- "reward": 0.6562500149011612,
624
- "reward_std": 0.2276246253401041,
625
- "rewards/accuracy_reward": 0.6562500149011612,
626
  "rewards/format_reward": 0.0,
627
  "step": 210
628
  },
629
  {
630
  "clip_ratio": 0.0,
631
- "completion_length": 625.4041862487793,
632
  "epoch": 0.688,
633
- "grad_norm": 288.1315612792969,
634
- "kl": 1.1891670227050781,
635
  "learning_rate": 8.040618237332491e-07,
636
- "loss": 0.1041,
637
- "reward": 0.6583333555608988,
638
- "reward_std": 0.26160254441201686,
639
- "rewards/accuracy_reward": 0.6583333555608988,
640
  "rewards/format_reward": 0.0,
641
  "step": 215
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
- "completion_length": 625.4625175476074,
646
  "epoch": 0.704,
647
- "grad_norm": 21.628314971923828,
648
- "kl": 0.9924446105957031,
649
  "learning_rate": 7.30651083891141e-07,
650
- "loss": 0.1149,
651
- "reward": 0.6895833514630795,
652
- "reward_std": 0.22668088637292386,
653
- "rewards/accuracy_reward": 0.6895833514630795,
654
  "rewards/format_reward": 0.0,
655
  "step": 220
656
  },
657
  {
658
  "clip_ratio": 0.0,
659
- "completion_length": 625.743766784668,
660
  "epoch": 0.72,
661
- "grad_norm": 14.641454696655273,
662
- "kl": 0.7305183410644531,
663
  "learning_rate": 6.596610003707959e-07,
664
- "loss": 0.0785,
665
- "reward": 0.695833345502615,
666
- "reward_std": 0.17440169751644136,
667
- "rewards/accuracy_reward": 0.695833345502615,
668
  "rewards/format_reward": 0.0,
669
  "step": 225
670
  },
671
  {
672
  "clip_ratio": 0.0,
673
- "completion_length": 654.6062652587891,
674
  "epoch": 0.736,
675
- "grad_norm": 50.088165283203125,
676
- "kl": 0.741015625,
677
  "learning_rate": 5.913149342387704e-07,
678
- "loss": 0.0796,
679
- "reward": 0.6270833522081375,
680
- "reward_std": 0.27152420245110986,
681
- "rewards/accuracy_reward": 0.6270833522081375,
682
  "rewards/format_reward": 0.0,
683
  "step": 230
684
  },
685
  {
686
  "clip_ratio": 0.0,
687
- "completion_length": 607.5229385375976,
688
  "epoch": 0.752,
689
- "grad_norm": 36.59616470336914,
690
- "kl": 0.9331382751464844,
691
  "learning_rate": 5.258279275047247e-07,
692
- "loss": 0.1222,
693
- "reward": 0.6666666839271784,
694
- "reward_std": 0.22182335667312145,
695
- "rewards/accuracy_reward": 0.6666666839271784,
696
  "rewards/format_reward": 0.0,
697
  "step": 235
698
  },
699
  {
700
  "clip_ratio": 0.0,
701
- "completion_length": 592.9187698364258,
702
  "epoch": 0.768,
703
- "grad_norm": 223.9988555908203,
704
- "kl": 0.246234130859375,
705
  "learning_rate": 4.63406026519703e-07,
706
- "loss": 0.0524,
707
- "reward": 0.6916666842997075,
708
- "reward_std": 0.2099679421633482,
709
- "rewards/accuracy_reward": 0.6916666842997075,
710
  "rewards/format_reward": 0.0,
711
  "step": 240
712
  },
713
  {
714
  "clip_ratio": 0.0,
715
- "completion_length": 621.9333503723144,
716
  "epoch": 0.784,
717
- "grad_norm": 1107.702880859375,
718
- "kl": 0.9261028289794921,
719
  "learning_rate": 4.042456336780838e-07,
720
- "loss": 0.13,
721
- "reward": 0.6875000171363354,
722
- "reward_std": 0.2305021207779646,
723
- "rewards/accuracy_reward": 0.6875000171363354,
724
  "rewards/format_reward": 0.0,
725
  "step": 245
726
  },
727
  {
728
  "clip_ratio": 0.0,
729
- "completion_length": 620.8375205993652,
730
  "epoch": 0.8,
731
- "grad_norm": 90.20494079589844,
732
- "kl": 2.3170936584472654,
733
  "learning_rate": 3.4853288946298335e-07,
734
- "loss": 0.1783,
735
- "reward": 0.6333333445712924,
736
- "reward_std": 0.2817912921309471,
737
- "rewards/accuracy_reward": 0.6333333445712924,
738
  "rewards/format_reward": 0.0,
739
  "step": 250
740
  },
741
  {
742
  "clip_ratio": 0.0,
743
- "completion_length": 596.6895973205567,
744
  "epoch": 0.816,
745
- "grad_norm": 27.478755950927734,
746
- "kl": 1.1328254699707032,
747
  "learning_rate": 2.9644308677943315e-07,
748
- "loss": 0.1288,
749
- "reward": 0.6791666857898235,
750
- "reward_std": 0.2638354554772377,
751
- "rewards/accuracy_reward": 0.6791666857898235,
752
  "rewards/format_reward": 0.0,
753
  "step": 255
754
  },
755
  {
756
  "clip_ratio": 0.0,
757
- "completion_length": 616.3125152587891,
758
  "epoch": 0.832,
759
- "grad_norm": 17.04458236694336,
760
- "kl": 0.6115745544433594,
761
  "learning_rate": 2.48140119418046e-07,
762
- "loss": 0.0903,
763
- "reward": 0.6937500171363353,
764
- "reward_std": 0.26671295091509817,
765
- "rewards/accuracy_reward": 0.6937500171363353,
766
  "rewards/format_reward": 0.0,
767
  "step": 260
768
  },
769
  {
770
  "clip_ratio": 0.0,
771
- "completion_length": 632.5791809082032,
772
  "epoch": 0.848,
773
- "grad_norm": 49.66255187988281,
774
- "kl": 0.5178520202636718,
775
  "learning_rate": 2.0377596638451812e-07,
776
- "loss": 0.0998,
777
- "reward": 0.6895833522081375,
778
- "reward_std": 0.23819086849689483,
779
- "rewards/accuracy_reward": 0.6895833522081375,
780
  "rewards/format_reward": 0.0,
781
  "step": 265
782
  },
783
  {
784
  "clip_ratio": 0.0,
785
- "completion_length": 616.1958518981934,
786
  "epoch": 0.864,
787
- "grad_norm": 109.28793334960938,
788
- "kl": 2.1872650146484376,
789
  "learning_rate": 1.634902137174483e-07,
790
- "loss": 0.1276,
791
- "reward": 0.675000019185245,
792
- "reward_std": 0.22921295054256915,
793
- "rewards/accuracy_reward": 0.675000019185245,
794
  "rewards/format_reward": 0.0,
795
  "step": 270
796
  },
797
  {
798
  "clip_ratio": 0.0,
799
- "completion_length": 605.8229377746582,
800
  "epoch": 0.88,
801
- "grad_norm": 27.828811645507812,
802
- "kl": 0.8389869689941406,
803
  "learning_rate": 1.274096152990203e-07,
804
- "loss": 0.1027,
805
- "reward": 0.7250000169500709,
806
- "reward_std": 0.2292129497975111,
807
- "rewards/accuracy_reward": 0.7250000169500709,
808
  "rewards/format_reward": 0.0,
809
  "step": 275
810
  },
811
  {
812
  "clip_ratio": 0.0,
813
- "completion_length": 608.7375160217285,
814
  "epoch": 0.896,
815
- "grad_norm": 7.841161251068115,
816
- "kl": 0.7010269165039062,
817
  "learning_rate": 9.564769404039419e-08,
818
- "loss": 0.0979,
819
- "reward": 0.7083333484828472,
820
- "reward_std": 0.22087961621582508,
821
- "rewards/accuracy_reward": 0.7083333484828472,
822
  "rewards/format_reward": 0.0,
823
  "step": 280
824
  },
825
  {
826
  "clip_ratio": 0.0,
827
- "completion_length": 630.8354316711426,
828
  "epoch": 0.912,
829
- "grad_norm": 27.939838409423828,
830
- "kl": 1.3990249633789062,
831
  "learning_rate": 6.830438469662892e-08,
832
- "loss": 0.1243,
833
- "reward": 0.6291666835546493,
834
- "reward_std": 0.27216878831386565,
835
- "rewards/accuracy_reward": 0.6291666835546493,
836
  "rewards/format_reward": 0.0,
837
  "step": 285
838
  },
839
  {
840
  "clip_ratio": 0.0,
841
- "completion_length": 651.3729393005372,
842
  "epoch": 0.928,
843
- "grad_norm": 12.43730354309082,
844
- "kl": 0.5821548461914062,
845
  "learning_rate": 4.546571943496969e-08,
846
- "loss": 0.1056,
847
- "reward": 0.666666685603559,
848
- "reward_std": 0.2651246260851622,
849
- "rewards/accuracy_reward": 0.666666685603559,
850
  "rewards/format_reward": 0.0,
851
  "step": 290
852
  },
853
  {
854
  "clip_ratio": 0.0,
855
- "completion_length": 640.3312721252441,
856
  "epoch": 0.944,
857
- "grad_norm": 57.05141067504883,
858
- "kl": 0.9673851013183594,
859
  "learning_rate": 2.72035571458224e-08,
860
- "loss": 0.1247,
861
- "reward": 0.6854166865348816,
862
- "reward_std": 0.2596687875688076,
863
- "rewards/accuracy_reward": 0.6854166865348816,
864
  "rewards/format_reward": 0.0,
865
  "step": 295
866
  },
867
  {
868
  "epoch": 0.96,
869
- "grad_norm": 16.82033920288086,
870
  "learning_rate": 1.357535734809795e-08,
871
- "loss": 0.1374,
872
  "step": 300
873
  },
874
  {
875
  "epoch": 0.96,
876
  "eval_clip_ratio": 0.0,
877
- "eval_completion_length": 555.4643360983112,
878
- "eval_kl": 0.9598001263098802,
879
- "eval_loss": 0.11535439640283585,
880
- "eval_reward": 0.7320359474839921,
881
- "eval_reward_std": 0.2536725497352863,
882
- "eval_rewards/accuracy_reward": 0.7320359474839921,
883
  "eval_rewards/format_reward": 0.0,
884
- "eval_runtime": 3149.9691,
885
- "eval_samples_per_second": 0.317,
886
- "eval_steps_per_second": 0.027,
887
  "step": 300
888
  },
889
  {
890
  "clip_ratio": 0.0,
891
- "completion_length": 609.3489749908447,
892
  "epoch": 0.976,
893
- "grad_norm": 13.1323881149292,
894
- "kl": 1.1041545867919922,
895
  "learning_rate": 4.623999400308054e-09,
896
- "loss": 0.1097,
897
- "reward": 0.6729166850447654,
898
- "reward_std": 0.23466878831386567,
899
- "rewards/accuracy_reward": 0.6729166850447654,
900
  "rewards/format_reward": 0.0,
901
  "step": 305
902
  },
903
  {
904
  "clip_ratio": 0.0,
905
- "completion_length": 621.8541877746582,
906
  "epoch": 0.992,
907
- "grad_norm": 17.46805763244629,
908
- "kl": 0.6278785705566406,
909
  "learning_rate": 3.77647586240204e-10,
910
- "loss": 0.115,
911
- "reward": 0.6645833529531956,
912
- "reward_std": 0.2218696340918541,
913
- "rewards/accuracy_reward": 0.6645833529531956,
914
  "rewards/format_reward": 0.0,
915
  "step": 310
916
  },
917
  {
918
  "clip_ratio": 0.0,
919
- "completion_length": 679.5989780426025,
920
  "epoch": 0.9984,
921
- "kl": 0.9321136474609375,
922
- "reward": 0.6093750093132257,
923
- "reward_std": 0.21961358468979597,
924
- "rewards/accuracy_reward": 0.6093750093132257,
925
  "rewards/format_reward": 0.0,
926
  "step": 312,
927
  "total_flos": 0.0,
928
- "train_loss": 0.06954928480375272,
929
- "train_runtime": 34706.12,
930
- "train_samples_per_second": 0.216,
931
- "train_steps_per_second": 0.009
932
  }
933
  ],
934
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "clip_ratio": 0.0,
13
+ "completion_length": 584.6666870117188,
14
  "epoch": 0.0032,
15
+ "grad_norm": 0.10400390625,
16
  "kl": 0.0,
17
  "learning_rate": 9.375e-08,
18
+ "loss": 0.0003,
19
+ "reward": 0.802083358168602,
20
+ "reward_std": 0.17633545212447643,
21
+ "rewards/accuracy_reward": 0.802083358168602,
22
  "rewards/format_reward": 0.0,
23
  "step": 1
24
  },
25
  {
26
  "clip_ratio": 0.0,
27
+ "completion_length": 571.5573091506958,
28
  "epoch": 0.016,
29
+ "grad_norm": 0.06396484375,
30
+ "kl": 7.958153128129197e-05,
31
  "learning_rate": 4.6875e-07,
32
+ "loss": 0.0157,
33
+ "reward": 0.7526041800156236,
34
+ "reward_std": 0.13064012676477432,
35
+ "rewards/accuracy_reward": 0.7526041800156236,
36
  "rewards/format_reward": 0.0,
37
  "step": 5
38
  },
39
  {
40
  "clip_ratio": 0.0,
41
+ "completion_length": 543.472933959961,
42
  "epoch": 0.032,
43
+ "grad_norm": 0.11181640625,
44
+ "kl": 0.00010847050416487036,
45
  "learning_rate": 9.375e-07,
46
+ "loss": 0.0178,
47
+ "reward": 0.7562500115483999,
48
+ "reward_std": 0.12246793955564499,
49
+ "rewards/accuracy_reward": 0.7562500115483999,
50
  "rewards/format_reward": 0.0,
51
  "step": 10
52
  },
53
  {
54
  "clip_ratio": 0.0,
55
+ "completion_length": 561.6458534240722,
56
  "epoch": 0.048,
57
+ "grad_norm": 0.12890625,
58
+ "kl": 0.0001124277588132827,
59
  "learning_rate": 1.40625e-06,
60
+ "loss": 0.0289,
61
+ "reward": 0.7354166857898236,
62
+ "reward_std": 0.1497008502483368,
63
+ "rewards/accuracy_reward": 0.7354166857898236,
64
  "rewards/format_reward": 0.0,
65
  "step": 15
66
  },
67
  {
68
  "clip_ratio": 0.0,
69
+ "completion_length": 587.439599609375,
70
  "epoch": 0.064,
71
+ "grad_norm": 0.060546875,
72
+ "kl": 0.0001041516380610119,
73
  "learning_rate": 1.875e-06,
74
+ "loss": 0.03,
75
+ "reward": 0.7645833499729633,
76
+ "reward_std": 0.14488959796726703,
77
+ "rewards/accuracy_reward": 0.7645833499729633,
78
  "rewards/format_reward": 0.0,
79
  "step": 20
80
  },
81
  {
82
  "clip_ratio": 0.0,
83
+ "completion_length": 542.566683959961,
84
  "epoch": 0.08,
85
+ "grad_norm": 0.11083984375,
86
+ "kl": 0.00010825455901795067,
87
  "learning_rate": 2.3437500000000002e-06,
88
+ "loss": 0.0144,
89
+ "reward": 0.7187500149011612,
90
+ "reward_std": 0.12599002085626126,
91
+ "rewards/accuracy_reward": 0.7187500149011612,
92
  "rewards/format_reward": 0.0,
93
  "step": 25
94
  },
95
  {
96
  "clip_ratio": 0.0,
97
+ "completion_length": 528.8000137329102,
98
  "epoch": 0.096,
99
+ "grad_norm": 0.08984375,
100
+ "kl": 0.00010333679629184189,
101
  "learning_rate": 2.8125e-06,
102
+ "loss": 0.0131,
103
+ "reward": 0.7687500163912773,
104
+ "reward_std": 0.11284543462097645,
105
+ "rewards/accuracy_reward": 0.7687500163912773,
106
  "rewards/format_reward": 0.0,
107
  "step": 30
108
  },
109
  {
110
  "clip_ratio": 0.0,
111
+ "completion_length": 554.2833518981934,
112
  "epoch": 0.112,
113
+ "grad_norm": 0.134765625,
114
+ "kl": 0.00010024149205491994,
115
  "learning_rate": 2.9991503375003e-06,
116
+ "loss": 0.0195,
117
+ "reward": 0.7666666831821203,
118
+ "reward_std": 0.17534543611109257,
119
+ "rewards/accuracy_reward": 0.7666666831821203,
120
  "rewards/format_reward": 0.0,
121
  "step": 35
122
  },
123
  {
124
  "clip_ratio": 0.0,
125
+ "completion_length": 575.7479362487793,
126
  "epoch": 0.128,
127
+ "grad_norm": 0.1142578125,
128
+ "kl": 0.000104643427857809,
129
  "learning_rate": 2.993961440992859e-06,
130
+ "loss": 0.0159,
131
+ "reward": 0.7541666865348816,
132
+ "reward_std": 0.1634900216013193,
133
+ "rewards/accuracy_reward": 0.7541666865348816,
134
  "rewards/format_reward": 0.0,
135
  "step": 40
136
  },
137
  {
138
  "clip_ratio": 0.0,
139
+ "completion_length": 557.6687637329102,
140
  "epoch": 0.144,
141
+ "grad_norm": 0.11474609375,
142
+ "kl": 0.00010511839755054098,
143
  "learning_rate": 2.984071989079555e-06,
144
+ "loss": 0.0248,
145
+ "reward": 0.733333352021873,
146
+ "reward_std": 0.16830127350986004,
147
+ "rewards/accuracy_reward": 0.733333352021873,
148
  "rewards/format_reward": 0.0,
149
  "step": 45
150
  },
151
  {
152
  "clip_ratio": 0.0,
153
+ "completion_length": 550.9521041870117,
154
  "epoch": 0.16,
155
+ "grad_norm": 0.14453125,
156
+ "kl": 0.0001057840139765176,
157
  "learning_rate": 2.9695130976348534e-06,
158
+ "loss": 0.0072,
159
+ "reward": 0.7645833477377891,
160
+ "reward_std": 0.12599002085626126,
161
+ "rewards/accuracy_reward": 0.7645833477377891,
162
  "rewards/format_reward": 0.0,
163
  "step": 50
164
  },
165
  {
166
  "clip_ratio": 0.0,
167
+ "completion_length": 548.1979293823242,
168
  "epoch": 0.176,
169
+ "grad_norm": 0.1220703125,
170
+ "kl": 0.00011230868503844249,
171
  "learning_rate": 2.9503305743175096e-06,
172
+ "loss": 0.0145,
173
+ "reward": 0.7333333499729633,
174
+ "reward_std": 0.1445904441177845,
175
+ "rewards/accuracy_reward": 0.7333333499729633,
176
  "rewards/format_reward": 0.0,
177
  "step": 55
178
  },
179
  {
180
  "clip_ratio": 0.0,
181
+ "completion_length": 571.2687675476075,
182
  "epoch": 0.192,
183
+ "grad_norm": 0.000701904296875,
184
+ "kl": 9.881708028842695e-05,
185
  "learning_rate": 2.9265847744427307e-06,
186
+ "loss": 0.014,
187
+ "reward": 0.8062500104308128,
188
+ "reward_std": 0.11636751629412175,
189
+ "rewards/accuracy_reward": 0.8062500104308128,
190
  "rewards/format_reward": 0.0,
191
  "step": 60
192
  },
193
  {
194
  "clip_ratio": 0.0,
195
+ "completion_length": 516.7791831970214,
196
  "epoch": 0.208,
197
+ "grad_norm": 0.08544921875,
198
+ "kl": 0.00010831438612513011,
199
  "learning_rate": 2.8983504110820214e-06,
200
+ "loss": 0.0154,
201
+ "reward": 0.8041666813194752,
202
+ "reward_std": 0.11477919146418572,
203
+ "rewards/accuracy_reward": 0.8041666813194752,
204
  "rewards/format_reward": 0.0,
205
  "step": 65
206
  },
207
  {
208
  "clip_ratio": 0.0,
209
+ "completion_length": 521.7562652587891,
210
  "epoch": 0.224,
211
+ "grad_norm": 0.06982421875,
212
+ "kl": 0.00010487217359695933,
213
  "learning_rate": 2.865716319988224e-06,
214
+ "loss": 0.0319,
215
+ "reward": 0.8083333514630795,
216
+ "reward_std": 0.11959044374525547,
217
+ "rewards/accuracy_reward": 0.8083333514630795,
218
  "rewards/format_reward": 0.0,
219
  "step": 70
220
  },
221
  {
222
  "clip_ratio": 0.0,
223
+ "completion_length": 558.2396018981933,
224
  "epoch": 0.24,
225
+ "grad_norm": 0.1142578125,
226
+ "kl": 0.00011459490733614074,
227
  "learning_rate": 2.82878518008537e-06,
228
+ "loss": 0.0156,
229
+ "reward": 0.7729166850447655,
230
+ "reward_std": 0.130801273137331,
231
+ "rewards/accuracy_reward": 0.7729166850447655,
232
  "rewards/format_reward": 0.0,
233
  "step": 75
234
  },
235
  {
236
  "clip_ratio": 0.0,
237
+ "completion_length": 554.2271026611328,
238
  "epoch": 0.256,
239
+ "grad_norm": 0.140625,
240
+ "kl": 0.00010408069401819375,
241
  "learning_rate": 2.7876731904027993e-06,
242
+ "loss": 0.0213,
243
+ "reward": 0.7583333492279053,
244
+ "reward_std": 0.1705341838300228,
245
+ "rewards/accuracy_reward": 0.7583333492279053,
246
  "rewards/format_reward": 0.0,
247
  "step": 80
248
  },
249
  {
250
  "clip_ratio": 0.0,
251
+ "completion_length": 520.0208503723145,
252
  "epoch": 0.272,
253
+ "grad_norm": 0.11474609375,
254
+ "kl": 0.00011381399317542674,
255
  "learning_rate": 2.7425097044700246e-06,
256
+ "loss": 0.0105,
257
+ "reward": 0.8062500104308128,
258
+ "reward_std": 0.1369016956537962,
259
+ "rewards/accuracy_reward": 0.8062500104308128,
260
  "rewards/format_reward": 0.0,
261
  "step": 85
262
  },
263
  {
264
  "clip_ratio": 0.0,
265
+ "completion_length": 566.3708511352539,
266
  "epoch": 0.288,
267
+ "grad_norm": 0.08154296875,
268
+ "kl": 0.00011387564354663482,
269
  "learning_rate": 2.6934368233226715e-06,
270
+ "loss": 0.0226,
271
+ "reward": 0.7479166813194752,
272
+ "reward_std": 0.11413460597395897,
273
+ "rewards/accuracy_reward": 0.7479166813194752,
274
  "rewards/format_reward": 0.0,
275
  "step": 90
276
  },
277
  {
278
  "clip_ratio": 0.0,
279
+ "completion_length": 544.3666809082031,
280
  "epoch": 0.304,
281
+ "grad_norm": 0.109375,
282
+ "kl": 0.00010668279064702801,
283
  "learning_rate": 2.6406089484000465e-06,
284
+ "loss": 0.018,
285
+ "reward": 0.812500013411045,
286
+ "reward_std": 0.12886751629412174,
287
+ "rewards/accuracy_reward": 0.812500013411045,
288
  "rewards/format_reward": 0.0,
289
  "step": 95
290
  },
291
  {
292
  "epoch": 0.32,
293
+ "grad_norm": 0.07763671875,
294
  "learning_rate": 2.584192295741087e-06,
295
+ "loss": 0.022,
296
  "step": 100
297
  },
298
  {
299
  "epoch": 0.32,
300
  "eval_clip_ratio": 0.0,
301
+ "eval_completion_length": 465.4548533091288,
302
+ "eval_kl": 9.347488428797144e-05,
303
+ "eval_loss": 0.0055223857052624226,
304
+ "eval_reward": 0.8567864413032988,
305
+ "eval_reward_std": 0.12574039105169788,
306
+ "eval_rewards/accuracy_reward": 0.8567864413032988,
307
  "eval_rewards/format_reward": 0.0,
308
+ "eval_runtime": 1169.6676,
309
+ "eval_samples_per_second": 0.855,
310
+ "eval_steps_per_second": 0.072,
311
  "step": 100
312
  },
313
  {
314
  "clip_ratio": 0.0,
315
+ "completion_length": 532.1687637329102,
316
  "epoch": 0.336,
317
+ "grad_norm": 0.035400390625,
318
+ "kl": 0.0001123551235650666,
319
  "learning_rate": 2.5243643730072105e-06,
320
+ "loss": 0.0119,
321
+ "reward": 0.7843750141561031,
322
+ "reward_std": 0.13305732235312462,
323
+ "rewards/accuracy_reward": 0.7843750141561031,
324
  "rewards/format_reward": 0.0,
325
  "step": 105
326
  },
327
  {
328
  "clip_ratio": 0.0,
329
+ "completion_length": 564.0521003723145,
330
  "epoch": 0.352,
331
+ "grad_norm": 0.095703125,
332
+ "kl": 0.00012054569197061937,
333
  "learning_rate": 2.461313420977536e-06,
334
+ "loss": 0.0074,
335
+ "reward": 0.733333346247673,
336
+ "reward_std": 0.13496793992817402,
337
+ "rewards/accuracy_reward": 0.733333346247673,
338
  "rewards/format_reward": 0.0,
339
  "step": 110
340
  },
341
  {
342
  "clip_ratio": 0.0,
343
+ "completion_length": 550.6270965576172,
344
  "epoch": 0.368,
345
+ "grad_norm": 0.07373046875,
346
+ "kl": 0.00011242426007811446,
347
  "learning_rate": 2.3952378212737554e-06,
348
+ "loss": 0.0106,
349
+ "reward": 0.8083333492279052,
350
+ "reward_std": 0.12182335406541825,
351
+ "rewards/accuracy_reward": 0.8083333492279052,
352
  "rewards/format_reward": 0.0,
353
  "step": 115
354
  },
355
  {
356
  "clip_ratio": 0.0,
357
+ "completion_length": 558.2479309082031,
358
  "epoch": 0.384,
359
+ "grad_norm": 0.11181640625,
360
+ "kl": 0.0001131312132201856,
361
  "learning_rate": 2.3263454721781537e-06,
362
+ "loss": 0.009,
363
+ "reward": 0.7833333492279053,
364
+ "reward_std": 0.13015668764710425,
365
+ "rewards/accuracy_reward": 0.7833333492279053,
366
  "rewards/format_reward": 0.0,
367
  "step": 120
368
  },
369
  {
370
  "clip_ratio": 0.0,
371
+ "completion_length": 563.1083541870117,
372
  "epoch": 0.4,
373
+ "grad_norm": 0.12255859375,
374
+ "kl": 0.00012562979900394567,
375
  "learning_rate": 2.2548531345087003e-06,
376
+ "loss": 0.0248,
377
+ "reward": 0.7270833514630795,
378
+ "reward_std": 0.16413460709154606,
379
+ "rewards/accuracy_reward": 0.7270833514630795,
380
  "rewards/format_reward": 0.0,
381
  "step": 125
382
  },
383
  {
384
  "clip_ratio": 0.0,
385
+ "completion_length": 524.6166809082031,
386
  "epoch": 0.416,
387
+ "grad_norm": 0.0966796875,
388
+ "kl": 0.00011902451351488707,
389
  "learning_rate": 2.18098574960932e-06,
390
+ "loss": 0.0201,
391
+ "reward": 0.812500013411045,
392
+ "reward_std": 0.1340242002159357,
393
+ "rewards/accuracy_reward": 0.812500013411045,
394
  "rewards/format_reward": 0.0,
395
  "step": 130
396
  },
397
  {
398
  "clip_ratio": 0.0,
399
+ "completion_length": 579.4187637329102,
400
  "epoch": 0.432,
401
+ "grad_norm": 0.140625,
402
+ "kl": 0.00013472076934704092,
403
  "learning_rate": 2.104975731601208e-06,
404
+ "loss": 0.0297,
405
+ "reward": 0.6520833492279052,
406
+ "reward_std": 0.1846687864512205,
407
+ "rewards/accuracy_reward": 0.6520833492279052,
408
  "rewards/format_reward": 0.0,
409
  "step": 135
410
  },
411
  {
412
  "clip_ratio": 0.0,
413
+ "completion_length": 561.5541854858399,
414
  "epoch": 0.448,
415
+ "grad_norm": 0.09326171875,
416
+ "kl": 0.00012283536361792357,
417
  "learning_rate": 2.027062236122014e-06,
418
+ "loss": 0.0136,
419
+ "reward": 0.762500012665987,
420
+ "reward_std": 0.14682335481047631,
421
+ "rewards/accuracy_reward": 0.762500012665987,
422
  "rewards/format_reward": 0.0,
423
  "step": 140
424
  },
425
  {
426
  "clip_ratio": 0.0,
427
+ "completion_length": 538.0895980834961,
428
  "epoch": 0.464,
429
+ "grad_norm": 0.07861328125,
430
+ "kl": 0.00012730688667943467,
431
  "learning_rate": 1.9474904078537343e-06,
432
+ "loss": 0.0143,
433
+ "reward": 0.7520833484828472,
434
+ "reward_std": 0.1176566869020462,
435
+ "rewards/accuracy_reward": 0.7520833484828472,
436
  "rewards/format_reward": 0.0,
437
  "step": 145
438
  },
439
  {
440
  "clip_ratio": 0.0,
441
+ "completion_length": 550.629182434082,
442
  "epoch": 0.48,
443
+ "grad_norm": 0.1259765625,
444
+ "kl": 0.00011251820915276766,
445
  "learning_rate": 1.866510609206841e-06,
446
+ "loss": 0.0182,
447
+ "reward": 0.7520833499729633,
448
+ "reward_std": 0.13690169639885424,
449
+ "rewards/accuracy_reward": 0.7520833499729633,
450
  "rewards/format_reward": 0.0,
451
  "step": 150
452
  },
453
  {
454
  "clip_ratio": 0.0,
455
+ "completion_length": 566.2791816711426,
456
  "epoch": 0.496,
457
+ "grad_norm": 0.080078125,
458
+ "kl": 0.00012192594749649288,
459
  "learning_rate": 1.784377632587518e-06,
460
+ "loss": 0.0135,
461
+ "reward": 0.7062500165775418,
462
+ "reward_std": 0.1689458593726158,
463
+ "rewards/accuracy_reward": 0.7062500165775418,
464
  "rewards/format_reward": 0.0,
465
  "step": 155
466
  },
467
  {
468
  "clip_ratio": 0.0,
469
+ "completion_length": 596.4979377746582,
470
  "epoch": 0.512,
471
+ "grad_norm": 0.1044921875,
472
+ "kl": 0.00012320739442657215,
473
  "learning_rate": 1.7013498987264833e-06,
474
+ "loss": 0.0214,
475
+ "reward": 0.7312500171363354,
476
+ "reward_std": 0.139134606346488,
477
+ "rewards/accuracy_reward": 0.7312500171363354,
478
  "rewards/format_reward": 0.0,
479
  "step": 160
480
  },
481
  {
482
  "clip_ratio": 0.0,
483
+ "completion_length": 546.1958457946778,
484
  "epoch": 0.528,
485
+ "grad_norm": 0.11083984375,
486
+ "kl": 0.00013099992102070246,
487
  "learning_rate": 1.6176886435917677e-06,
488
+ "loss": 0.0082,
489
+ "reward": 0.7520833477377892,
490
+ "reward_std": 0.1285683624446392,
491
+ "rewards/accuracy_reward": 0.7520833477377892,
492
  "rewards/format_reward": 0.0,
493
  "step": 165
494
  },
495
  {
496
  "clip_ratio": 0.0,
497
+ "completion_length": 559.1458511352539,
498
  "epoch": 0.544,
499
+ "grad_norm": 0.08935546875,
500
+ "kl": 0.00012905245066576753,
501
  "learning_rate": 1.5336570964437077e-06,
502
+ "loss": 0.0157,
503
+ "reward": 0.7000000156462193,
504
+ "reward_std": 0.15644585900008678,
505
+ "rewards/accuracy_reward": 0.7000000156462193,
506
  "rewards/format_reward": 0.0,
507
  "step": 170
508
  },
509
  {
510
  "clip_ratio": 0.0,
511
+ "completion_length": 554.4812644958496,
512
  "epoch": 0.56,
513
+ "grad_norm": 0.1279296875,
514
+ "kl": 0.0001381170730383019,
515
  "learning_rate": 1.4495196516183096e-06,
516
+ "loss": 0.0115,
517
+ "reward": 0.7479166779667139,
518
+ "reward_std": 0.139134606346488,
519
+ "rewards/accuracy_reward": 0.7479166779667139,
520
  "rewards/format_reward": 0.0,
521
  "step": 175
522
  },
523
  {
524
  "clip_ratio": 0.0,
525
+ "completion_length": 540.095849609375,
526
  "epoch": 0.576,
527
+ "grad_norm": 0.07275390625,
528
+ "kl": 0.00012737434626615142,
529
  "learning_rate": 1.3655410366448499e-06,
530
+ "loss": 0.0079,
531
+ "reward": 0.7666666805744171,
532
+ "reward_std": 0.13496793992817402,
533
+ "rewards/accuracy_reward": 0.7666666805744171,
534
  "rewards/format_reward": 0.0,
535
  "step": 180
536
  },
537
  {
538
  "clip_ratio": 0.0,
539
+ "completion_length": 599.0208534240722,
540
  "epoch": 0.592,
541
+ "grad_norm": 0.07958984375,
542
+ "kl": 0.0001210577192978235,
543
  "learning_rate": 1.2819854793151313e-06,
544
+ "loss": 0.0137,
545
+ "reward": 0.7145833492279052,
546
+ "reward_std": 0.1702350303530693,
547
+ "rewards/accuracy_reward": 0.7145833492279052,
548
  "rewards/format_reward": 0.0,
549
  "step": 185
550
  },
551
  {
552
  "clip_ratio": 0.0,
553
+ "completion_length": 557.2291839599609,
554
  "epoch": 0.608,
555
+ "grad_norm": 0.1552734375,
556
+ "kl": 0.00014638212196587118,
557
  "learning_rate": 1.199115876325091e-06,
558
+ "loss": 0.0123,
559
+ "reward": 0.7354166839271784,
560
+ "reward_std": 0.15932335443794726,
561
+ "rewards/accuracy_reward": 0.7354166839271784,
562
  "rewards/format_reward": 0.0,
563
  "step": 190
564
  },
565
  {
566
  "clip_ratio": 0.0,
567
+ "completion_length": 554.1521011352539,
568
  "epoch": 0.624,
569
+ "grad_norm": 0.126953125,
570
+ "kl": 0.0001363154207865591,
571
  "learning_rate": 1.1171929661045361e-06,
572
+ "loss": 0.0254,
573
+ "reward": 0.7708333477377891,
574
+ "reward_std": 0.15292377695441245,
575
+ "rewards/accuracy_reward": 0.7708333477377891,
576
  "rewards/format_reward": 0.0,
577
  "step": 195
578
  },
579
  {
580
  "epoch": 0.64,
581
+ "grad_norm": 0.07763671875,
582
  "learning_rate": 1.036474508437579e-06,
583
+ "loss": 0.0143,
584
  "step": 200
585
  },
586
  {
587
  "epoch": 0.64,
588
  "eval_clip_ratio": 0.0,
589
+ "eval_completion_length": 465.75450476914824,
590
+ "eval_kl": 0.00010685787577506903,
591
+ "eval_loss": 0.005738089792430401,
592
+ "eval_reward": 0.8522954227146274,
593
+ "eval_reward_std": 0.13115221712582126,
594
+ "eval_rewards/accuracy_reward": 0.8522954227146274,
595
  "eval_rewards/format_reward": 0.0,
596
+ "eval_runtime": 1182.5904,
597
+ "eval_samples_per_second": 0.846,
598
+ "eval_steps_per_second": 0.071,
599
  "step": 200
600
  },
601
  {
602
  "clip_ratio": 0.0,
603
+ "completion_length": 542.3031398773194,
604
  "epoch": 0.656,
605
+ "grad_norm": 0.11962890625,
606
+ "kl": 0.00013577902191173053,
607
  "learning_rate": 9.57214473454992e-07,
608
+ "loss": 0.0116,
609
+ "reward": 0.7760416804812849,
610
+ "reward_std": 0.1375231422483921,
611
+ "rewards/accuracy_reward": 0.7760416804812849,
612
  "rewards/format_reward": 0.0,
613
  "step": 205
614
  },
615
  {
616
  "clip_ratio": 0.0,
617
+ "completion_length": 555.0937683105469,
618
  "epoch": 0.672,
619
+ "grad_norm": 0.0693359375,
620
+ "kl": 0.0001307015376369236,
621
  "learning_rate": 8.796622425502193e-07,
622
+ "loss": 0.0163,
623
+ "reward": 0.7687500111758709,
624
+ "reward_std": 0.11731125563383102,
625
+ "rewards/accuracy_reward": 0.7687500111758709,
626
  "rewards/format_reward": 0.0,
627
  "step": 210
628
  },
629
  {
630
  "clip_ratio": 0.0,
631
+ "completion_length": 592.2062675476075,
632
  "epoch": 0.688,
633
+ "grad_norm": 0.10693359375,
634
+ "kl": 0.00013232407709438121,
635
  "learning_rate": 8.040618237332491e-07,
636
+ "loss": 0.0397,
637
+ "reward": 0.7083333488553762,
638
+ "reward_std": 0.17921294905245305,
639
+ "rewards/accuracy_reward": 0.7083333488553762,
640
  "rewards/format_reward": 0.0,
641
  "step": 215
642
  },
643
  {
644
  "clip_ratio": 0.0,
645
+ "completion_length": 563.9937641143799,
646
  "epoch": 0.704,
647
+ "grad_norm": 0.0947265625,
648
+ "kl": 0.000132297438904061,
649
  "learning_rate": 7.30651083891141e-07,
650
+ "loss": 0.0267,
651
+ "reward": 0.7666666850447654,
652
+ "reward_std": 0.14811252541840075,
653
+ "rewards/accuracy_reward": 0.7666666850447654,
654
  "rewards/format_reward": 0.0,
655
  "step": 220
656
  },
657
  {
658
  "clip_ratio": 0.0,
659
+ "completion_length": 584.647932434082,
660
  "epoch": 0.72,
661
+ "grad_norm": 0.0947265625,
662
+ "kl": 0.00012981849013158354,
663
  "learning_rate": 6.596610003707959e-07,
664
+ "loss": 0.0189,
665
+ "reward": 0.75416667945683,
666
+ "reward_std": 0.1494016967713833,
667
+ "rewards/accuracy_reward": 0.75416667945683,
668
  "rewards/format_reward": 0.0,
669
  "step": 225
670
  },
671
  {
672
  "clip_ratio": 0.0,
673
+ "completion_length": 599.6791854858399,
674
  "epoch": 0.736,
675
+ "grad_norm": 0.13671875,
676
+ "kl": 0.00013120998573867837,
677
  "learning_rate": 5.913149342387704e-07,
678
+ "loss": 0.0238,
679
+ "reward": 0.695833346620202,
680
+ "reward_std": 0.20069086775183678,
681
+ "rewards/accuracy_reward": 0.695833346620202,
682
  "rewards/format_reward": 0.0,
683
  "step": 230
684
  },
685
  {
686
  "clip_ratio": 0.0,
687
+ "completion_length": 545.3291854858398,
688
  "epoch": 0.752,
689
+ "grad_norm": 0.11767578125,
690
+ "kl": 0.00013769497109024088,
691
  "learning_rate": 5.258279275047247e-07,
692
+ "loss": 0.0255,
693
+ "reward": 0.7562500152736902,
694
+ "reward_std": 0.1404237776994705,
695
+ "rewards/accuracy_reward": 0.7562500152736902,
696
  "rewards/format_reward": 0.0,
697
  "step": 235
698
  },
699
  {
700
  "clip_ratio": 0.0,
701
+ "completion_length": 547.9833534240722,
702
  "epoch": 0.768,
703
+ "grad_norm": 0.07666015625,
704
+ "kl": 0.00013748378478339873,
705
  "learning_rate": 4.63406026519703e-07,
706
+ "loss": 0.0161,
707
+ "reward": 0.7562500193715096,
708
+ "reward_std": 0.130801273137331,
709
+ "rewards/accuracy_reward": 0.7562500193715096,
710
  "rewards/format_reward": 0.0,
711
  "step": 240
712
  },
713
  {
714
  "clip_ratio": 0.0,
715
+ "completion_length": 571.7437660217286,
716
  "epoch": 0.784,
717
+ "grad_norm": 0.11376953125,
718
+ "kl": 0.0001361471262498526,
719
  "learning_rate": 4.042456336780838e-07,
720
+ "loss": 0.0177,
721
+ "reward": 0.7125000152736902,
722
+ "reward_std": 0.1445904441177845,
723
+ "rewards/accuracy_reward": 0.7125000152736902,
724
  "rewards/format_reward": 0.0,
725
  "step": 245
726
  },
727
  {
728
  "clip_ratio": 0.0,
729
+ "completion_length": 568.1041801452636,
730
  "epoch": 0.8,
731
+ "grad_norm": 0.09716796875,
732
+ "kl": 0.00013276812351250555,
733
  "learning_rate": 3.4853288946298335e-07,
734
+ "loss": 0.0167,
735
+ "reward": 0.7312500160187483,
736
+ "reward_std": 0.1497008502483368,
737
+ "rewards/accuracy_reward": 0.7312500160187483,
738
  "rewards/format_reward": 0.0,
739
  "step": 250
740
  },
741
  {
742
  "clip_ratio": 0.0,
743
+ "completion_length": 544.4604331970215,
744
  "epoch": 0.816,
745
+ "grad_norm": 0.11474609375,
746
+ "kl": 0.00013813894056511345,
747
  "learning_rate": 2.9644308677943315e-07,
748
+ "loss": 0.0256,
749
+ "reward": 0.7708333522081375,
750
+ "reward_std": 0.1862571120262146,
751
+ "rewards/accuracy_reward": 0.7708333522081375,
752
  "rewards/format_reward": 0.0,
753
  "step": 255
754
  },
755
  {
756
  "clip_ratio": 0.0,
757
+ "completion_length": 552.7604331970215,
758
  "epoch": 0.832,
759
+ "grad_norm": 0.1513671875,
760
+ "kl": 0.00014264740912039997,
761
  "learning_rate": 2.48140119418046e-07,
762
+ "loss": 0.0134,
763
+ "reward": 0.7645833477377891,
764
+ "reward_std": 0.14488959796726703,
765
+ "rewards/accuracy_reward": 0.7645833477377891,
766
  "rewards/format_reward": 0.0,
767
  "step": 260
768
  },
769
  {
770
  "clip_ratio": 0.0,
771
+ "completion_length": 589.2396003723145,
772
  "epoch": 0.848,
773
+ "grad_norm": 0.12158203125,
774
+ "kl": 0.00013351873640203847,
775
  "learning_rate": 2.0377596638451812e-07,
776
+ "loss": 0.0213,
777
+ "reward": 0.7312500141561031,
778
+ "reward_std": 0.15932335443794726,
779
+ "rewards/accuracy_reward": 0.7312500141561031,
780
  "rewards/format_reward": 0.0,
781
  "step": 265
782
  },
783
  {
784
  "clip_ratio": 0.0,
785
+ "completion_length": 559.054182434082,
786
  "epoch": 0.864,
787
+ "grad_norm": 0.080078125,
788
+ "kl": 0.00012574124430102528,
789
  "learning_rate": 1.634902137174483e-07,
790
+ "loss": 0.0195,
791
+ "reward": 0.7604166835546493,
792
+ "reward_std": 0.1461787685751915,
793
+ "rewards/accuracy_reward": 0.7604166835546493,
794
  "rewards/format_reward": 0.0,
795
  "step": 270
796
  },
797
  {
798
  "clip_ratio": 0.0,
799
+ "completion_length": 570.9770980834961,
800
  "epoch": 0.88,
801
+ "grad_norm": 0.07666015625,
802
+ "kl": 0.00012617979391507105,
803
  "learning_rate": 1.274096152990203e-07,
804
+ "loss": 0.0195,
805
+ "reward": 0.7875000156462193,
806
+ "reward_std": 0.13977919220924379,
807
+ "rewards/accuracy_reward": 0.7875000156462193,
808
  "rewards/format_reward": 0.0,
809
  "step": 275
810
  },
811
  {
812
  "clip_ratio": 0.0,
813
+ "completion_length": 558.9125160217285,
814
  "epoch": 0.896,
815
+ "grad_norm": 0.0869140625,
816
+ "kl": 0.00013949446292826906,
817
  "learning_rate": 9.564769404039419e-08,
818
+ "loss": 0.0141,
819
+ "reward": 0.7562500238418579,
820
+ "reward_std": 0.1631908681243658,
821
+ "rewards/accuracy_reward": 0.7562500238418579,
822
  "rewards/format_reward": 0.0,
823
  "step": 280
824
  },
825
  {
826
  "clip_ratio": 0.0,
827
+ "completion_length": 582.5541831970215,
828
  "epoch": 0.912,
829
+ "grad_norm": 0.11083984375,
830
+ "kl": 0.0001382285308864084,
831
  "learning_rate": 6.830438469662892e-08,
832
+ "loss": 0.0199,
833
+ "reward": 0.7208333518356085,
834
+ "reward_std": 0.18496794067323208,
835
+ "rewards/accuracy_reward": 0.7208333518356085,
836
  "rewards/format_reward": 0.0,
837
  "step": 285
838
  },
839
  {
840
  "clip_ratio": 0.0,
841
+ "completion_length": 548.1500137329101,
842
  "epoch": 0.928,
843
+ "grad_norm": 0.11181640625,
844
+ "kl": 0.00013111710741213755,
845
  "learning_rate": 4.546571943496969e-08,
846
+ "loss": 0.0209,
847
+ "reward": 0.7916666833683849,
848
+ "reward_std": 0.13496793992817402,
849
+ "rewards/accuracy_reward": 0.7916666833683849,
850
  "rewards/format_reward": 0.0,
851
  "step": 290
852
  },
853
  {
854
  "clip_ratio": 0.0,
855
+ "completion_length": 578.8479362487793,
856
  "epoch": 0.944,
857
+ "grad_norm": 0.107421875,
858
+ "kl": 0.00012809678955818526,
859
  "learning_rate": 2.72035571458224e-08,
860
+ "loss": 0.0199,
861
+ "reward": 0.7708333499729634,
862
+ "reward_std": 0.17440169714391232,
863
+ "rewards/accuracy_reward": 0.7708333499729634,
864
  "rewards/format_reward": 0.0,
865
  "step": 295
866
  },
867
  {
868
  "epoch": 0.96,
869
+ "grad_norm": 0.11083984375,
870
  "learning_rate": 1.357535734809795e-08,
871
+ "loss": 0.0226,
872
  "step": 300
873
  },
874
  {
875
  "epoch": 0.96,
876
  "eval_clip_ratio": 0.0,
877
+ "eval_completion_length": 467.9635856308623,
878
+ "eval_kl": 0.00010789919093974118,
879
+ "eval_loss": 0.0016380093293264508,
880
+ "eval_reward": 0.8510479179328073,
881
+ "eval_reward_std": 0.1320316564001723,
882
+ "eval_rewards/accuracy_reward": 0.8510479179328073,
883
  "eval_rewards/format_reward": 0.0,
884
+ "eval_runtime": 1178.6538,
885
+ "eval_samples_per_second": 0.848,
886
+ "eval_steps_per_second": 0.071,
887
  "step": 300
888
  },
889
  {
890
  "clip_ratio": 0.0,
891
+ "completion_length": 547.3937664031982,
892
  "epoch": 0.976,
893
+ "grad_norm": 0.1279296875,
894
+ "kl": 0.00013282527597766603,
895
  "learning_rate": 4.623999400308054e-09,
896
+ "loss": 0.0227,
897
+ "reward": 0.7614583466202021,
898
+ "reward_std": 0.15324607044458388,
899
+ "rewards/accuracy_reward": 0.7614583466202021,
900
  "rewards/format_reward": 0.0,
901
  "step": 305
902
  },
903
  {
904
  "clip_ratio": 0.0,
905
+ "completion_length": 554.6375144958496,
906
  "epoch": 0.992,
907
+ "grad_norm": 0.0791015625,
908
+ "kl": 0.0001264505321159959,
909
  "learning_rate": 3.77647586240204e-10,
910
+ "loss": 0.0163,
911
+ "reward": 0.7479166820645332,
912
+ "reward_std": 0.1224679384380579,
913
+ "rewards/accuracy_reward": 0.7479166820645332,
914
  "rewards/format_reward": 0.0,
915
  "step": 310
916
  },
917
  {
918
  "clip_ratio": 0.0,
919
+ "completion_length": 579.932300567627,
920
  "epoch": 0.9984,
921
+ "kl": 0.0001223308304361126,
922
+ "reward": 0.7343750149011612,
923
+ "reward_std": 0.17472399026155472,
924
+ "rewards/accuracy_reward": 0.7343750149011612,
925
  "rewards/format_reward": 0.0,
926
  "step": 312,
927
  "total_flos": 0.0,
928
+ "train_loss": 0.018009291775785804,
929
+ "train_runtime": 13979.803,
930
+ "train_samples_per_second": 0.536,
931
+ "train_steps_per_second": 0.022
932
  }
933
  ],
934
  "logging_steps": 5,
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75426cbb48810980c022ea74e3a8713bf31e2c007ce1c68940fbb9ed994cf59b
3
- size 7992
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd6460b35ce0d8ab64bd4020b3f435a6e689ff77619644b16252bbfb8c3b610d
3
+ size 6648