lalalaDa committed on
Commit
11219b3
·
verified ·
1 Parent(s): 429797a

Model save

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: transformers
3
+ model_name: ER-GRPO-alpha30
4
+ tags:
5
+ - generated_from_trainer
6
+ - trl
7
+ - grpo
8
+ licence: license
9
+ ---
10
+
11
+ # Model Card for ER-GRPO-alpha30
12
+
13
+ This model is a fine-tuned checkpoint; its base model was not recorded when this card was generated (the template rendered it as `None`).
14
+ It has been trained using [TRL](https://github.com/huggingface/trl).
15
+
16
+ ## Quick start
17
+
18
+ ```python
19
+ from transformers import pipeline
20
+
21
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
22
+ generator = pipeline("text-generation", model="lalalaDa/ER-GRPO-alpha30", device="cuda")
23
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
24
+ print(output["generated_text"])
25
+ ```
26
+
27
+ ## Training procedure
28
+
29
+
30
+
31
+
32
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
33
+
34
+ ### Framework versions
35
+
36
+ - TRL: 0.18.1
37
+ - Transformers: 4.52.4
38
+ - Pytorch: 2.5.1
39
+ - Datasets: 3.6.0
40
+ - Tokenizers: 0.21.1
41
+
42
+ ## Citations
43
+
44
+ Cite GRPO as:
45
+
46
+ ```bibtex
47
+ @article{zhihong2024deepseekmath,
48
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
49
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
50
+ year = 2024,
51
+ eprint = {arXiv:2402.03300},
52
+ }
53
+
54
+ ```
55
+
56
+ Cite TRL as:
57
+
58
+ ```bibtex
59
+ @misc{vonwerra2022trl,
60
+ title = {{TRL: Transformer Reinforcement Learning}},
61
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallou{\'e}dec},
62
+ year = 2020,
63
+ journal = {GitHub repository},
64
+ publisher = {GitHub},
65
+ howpublished = {\url{https://github.com/huggingface/trl}}
66
+ }
67
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": -0.45832799572497607,
4
+ "train_runtime": 4441.2019,
5
+ "train_samples": 7000,
6
+ "train_samples_per_second": 0.54,
7
+ "train_steps_per_second": 0.011
8
+ }
generation_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 151646,
4
+ "do_sample": true,
5
+ "eos_token_id": 151643,
6
+ "temperature": 0.6,
7
+ "top_p": 0.95,
8
+ "transformers_version": "4.52.4"
9
+ }
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": -0.45832799572497607,
4
+ "train_runtime": 4441.2019,
5
+ "train_samples": 7000,
6
+ "train_samples_per_second": 0.54,
7
+ "train_steps_per_second": 0.011
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1493 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.05714285714285714,
6
+ "eval_steps": 500,
7
+ "global_step": 50,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "clip_ratio/high_max": 0.0,
14
+ "clip_ratio/high_mean": 0.0,
15
+ "clip_ratio/low_mean": 0.0,
16
+ "clip_ratio/low_min": 0.0,
17
+ "clip_ratio/region_mean": 0.0,
18
+ "completions/clipped_ratio": 0.5208333333333333,
19
+ "completions/max_length": 3584.0,
20
+ "completions/max_terminated_length": 3128.0,
21
+ "completions/mean_length": 2584.104248046875,
22
+ "completions/mean_terminated_length": 1497.2608642578125,
23
+ "completions/min_length": 557.0,
24
+ "completions/min_terminated_length": 557.0,
25
+ "epoch": 0.001142857142857143,
26
+ "frac_reward_zero_std": 0.0,
27
+ "grad_norm": 1.6251945495605469,
28
+ "kl": 0.0,
29
+ "learning_rate": 0.0,
30
+ "loss": -0.6946,
31
+ "num_tokens": 131153.0,
32
+ "policy_entropy_avg": 8.125,
33
+ "reward": 0.2579294741153717,
34
+ "reward_std": 0.505131185054779,
35
+ "rewards/cosine_scaled_reward/mean": -0.062009382992982864,
36
+ "rewards/cosine_scaled_reward/std": 0.43048128485679626,
37
+ "rewards/format_reward/mean": 0.5208333134651184,
38
+ "rewards/format_reward/std": 0.504852294921875,
39
+ "step": 1
40
+ },
41
+ {
42
+ "clip_ratio/high_max": 0.0,
43
+ "clip_ratio/high_mean": 0.0,
44
+ "clip_ratio/low_mean": 0.0,
45
+ "clip_ratio/low_min": 0.0,
46
+ "clip_ratio/region_mean": 0.0,
47
+ "completions/clipped_ratio": 0.5833333333333333,
48
+ "completions/max_length": 3584.0,
49
+ "completions/max_terminated_length": 3280.0,
50
+ "completions/mean_length": 2761.666748046875,
51
+ "completions/mean_terminated_length": 1610.4000244140625,
52
+ "completions/min_length": 465.0,
53
+ "completions/min_terminated_length": 465.0,
54
+ "epoch": 0.002285714285714286,
55
+ "frac_reward_zero_std": 0.0,
56
+ "grad_norm": 1.6243528127670288,
57
+ "kl": 0.0,
58
+ "learning_rate": 2e-07,
59
+ "loss": -1.3501,
60
+ "num_tokens": 271243.0,
61
+ "policy_entropy_avg": 8.125,
62
+ "reward": 0.26637595891952515,
63
+ "reward_std": 0.5504351854324341,
64
+ "rewards/cosine_scaled_reward/mean": -0.003428752301260829,
65
+ "rewards/cosine_scaled_reward/std": 0.4935320317745209,
66
+ "rewards/format_reward/mean": 0.4166666567325592,
67
+ "rewards/format_reward/std": 0.49822381138801575,
68
+ "step": 2
69
+ },
70
+ {
71
+ "clip_ratio/high_max": 0.0,
72
+ "clip_ratio/high_mean": 0.0,
73
+ "clip_ratio/low_mean": 0.0,
74
+ "clip_ratio/low_min": 0.0,
75
+ "clip_ratio/region_mean": 0.0,
76
+ "completions/clipped_ratio": 0.9166666666666666,
77
+ "completions/max_length": 3584.0,
78
+ "completions/max_terminated_length": 2914.0,
79
+ "completions/mean_length": 3405.541748046875,
80
+ "completions/mean_terminated_length": 1442.5,
81
+ "completions/min_length": 490.0,
82
+ "completions/min_terminated_length": 490.0,
83
+ "epoch": 0.0034285714285714284,
84
+ "frac_reward_zero_std": 0.0,
85
+ "grad_norm": 0.6798118352890015,
86
+ "kl": 0.0006580352783203125,
87
+ "learning_rate": 4e-07,
88
+ "loss": 1.3636,
89
+ "num_tokens": 442563.0,
90
+ "policy_entropy_avg": 8.125,
91
+ "reward": -0.14459262788295746,
92
+ "reward_std": 0.37038296461105347,
93
+ "rewards/cosine_scaled_reward/mean": -0.16330842673778534,
94
+ "rewards/cosine_scaled_reward/std": 0.2756437659263611,
95
+ "rewards/format_reward/mean": 0.1041666641831398,
96
+ "rewards/format_reward/std": 0.3087092638015747,
97
+ "step": 3
98
+ },
99
+ {
100
+ "clip_ratio/high_max": 0.0,
101
+ "clip_ratio/high_mean": 0.0,
102
+ "clip_ratio/low_mean": 0.0,
103
+ "clip_ratio/low_min": 0.0,
104
+ "clip_ratio/region_mean": 0.0,
105
+ "completions/clipped_ratio": 0.45833333333333337,
106
+ "completions/max_length": 3584.0,
107
+ "completions/max_terminated_length": 3581.0,
108
+ "completions/mean_length": 2397.45849609375,
109
+ "completions/mean_terminated_length": 1393.4615478515625,
110
+ "completions/min_length": 416.0,
111
+ "completions/min_terminated_length": 416.0,
112
+ "epoch": 0.004571428571428572,
113
+ "frac_reward_zero_std": 0.0,
114
+ "grad_norm": 1.0801085233688354,
115
+ "kl": 0.0006548563639322916,
116
+ "learning_rate": 6e-07,
117
+ "loss": -0.4683,
118
+ "num_tokens": 564997.0,
119
+ "policy_entropy_avg": 8.125,
120
+ "reward": 0.4537672996520996,
121
+ "reward_std": 0.6287642121315002,
122
+ "rewards/cosine_scaled_reward/mean": 0.0053017884492874146,
123
+ "rewards/cosine_scaled_reward/std": 0.44670990109443665,
124
+ "rewards/format_reward/mean": 0.6875,
125
+ "rewards/format_reward/std": 0.4684174358844757,
126
+ "step": 4
127
+ },
128
+ {
129
+ "clip_ratio/high_max": 0.0,
130
+ "clip_ratio/high_mean": 0.0,
131
+ "clip_ratio/low_mean": 0.0,
132
+ "clip_ratio/low_min": 0.0,
133
+ "clip_ratio/region_mean": 0.0,
134
+ "completions/clipped_ratio": 0.7291666666666667,
135
+ "completions/max_length": 3584.0,
136
+ "completions/max_terminated_length": 3368.0,
137
+ "completions/mean_length": 3252.5,
138
+ "completions/mean_terminated_length": 2360.0,
139
+ "completions/min_length": 598.0,
140
+ "completions/min_terminated_length": 598.0,
141
+ "epoch": 0.005714285714285714,
142
+ "frac_reward_zero_std": 0.0,
143
+ "grad_norm": 0.6357815265655518,
144
+ "kl": 0.0006510416666666666,
145
+ "learning_rate": 8e-07,
146
+ "loss": -0.2046,
147
+ "num_tokens": 729229.0,
148
+ "policy_entropy_avg": 8.125,
149
+ "reward": 0.025602590292692184,
150
+ "reward_std": 0.46193015575408936,
151
+ "rewards/cosine_scaled_reward/mean": -0.15738904476165771,
152
+ "rewards/cosine_scaled_reward/std": 0.38784292340278625,
153
+ "rewards/format_reward/mean": 0.3541666567325592,
154
+ "rewards/format_reward/std": 0.48332110047340393,
155
+ "step": 5
156
+ },
157
+ {
158
+ "clip_ratio/high_max": 0.0,
159
+ "clip_ratio/high_mean": 0.0,
160
+ "clip_ratio/low_mean": 0.0,
161
+ "clip_ratio/low_min": 0.0,
162
+ "clip_ratio/region_mean": 0.0,
163
+ "completions/clipped_ratio": 0.7916666666666666,
164
+ "completions/max_length": 3584.0,
165
+ "completions/max_terminated_length": 3435.0,
166
+ "completions/mean_length": 3184.45849609375,
167
+ "completions/mean_terminated_length": 1666.2000732421875,
168
+ "completions/min_length": 600.0,
169
+ "completions/min_terminated_length": 600.0,
170
+ "epoch": 0.006857142857142857,
171
+ "frac_reward_zero_std": 0.0,
172
+ "grad_norm": 0.611210286617279,
173
+ "kl": 0.0006860097249348959,
174
+ "learning_rate": 1e-06,
175
+ "loss": 0.6384,
176
+ "num_tokens": 890819.0,
177
+ "policy_entropy_avg": 8.125,
178
+ "reward": 0.006859039422124624,
179
+ "reward_std": 0.45094144344329834,
180
+ "rewards/cosine_scaled_reward/mean": -0.14055712521076202,
181
+ "rewards/cosine_scaled_reward/std": 0.32825249433517456,
182
+ "rewards/format_reward/mean": 0.2916666567325592,
183
+ "rewards/format_reward/std": 0.4593396484851837,
184
+ "step": 6
185
+ },
186
+ {
187
+ "clip_ratio/high_max": 0.0,
188
+ "clip_ratio/high_mean": 0.0,
189
+ "clip_ratio/low_mean": 0.0,
190
+ "clip_ratio/low_min": 0.0,
191
+ "clip_ratio/region_mean": 0.0,
192
+ "completions/clipped_ratio": 0.6041666666666667,
193
+ "completions/max_length": 3584.0,
194
+ "completions/max_terminated_length": 3543.0,
195
+ "completions/mean_length": 3090.0,
196
+ "completions/mean_terminated_length": 2336.0,
197
+ "completions/min_length": 952.0,
198
+ "completions/min_terminated_length": 952.0,
199
+ "epoch": 0.008,
200
+ "frac_reward_zero_std": 0.0,
201
+ "grad_norm": 0.6517191529273987,
202
+ "kl": 0.0005308787027994791,
203
+ "learning_rate": 9.989038226169207e-07,
204
+ "loss": -0.3634,
205
+ "num_tokens": 1046945.0,
206
+ "policy_entropy_avg": 8.125,
207
+ "reward": 0.3611307144165039,
208
+ "reward_std": 0.6935849189758301,
209
+ "rewards/cosine_scaled_reward/mean": 0.017376163974404335,
210
+ "rewards/cosine_scaled_reward/std": 0.5143836736679077,
211
+ "rewards/format_reward/mean": 0.5208333134651184,
212
+ "rewards/format_reward/std": 0.5048523545265198,
213
+ "step": 7
214
+ },
215
+ {
216
+ "clip_ratio/high_max": 0.0,
217
+ "clip_ratio/high_mean": 0.0,
218
+ "clip_ratio/low_mean": 0.0,
219
+ "clip_ratio/low_min": 0.0,
220
+ "clip_ratio/region_mean": 0.0,
221
+ "completions/clipped_ratio": 0.5625,
222
+ "completions/max_length": 3584.0,
223
+ "completions/max_terminated_length": 3421.0,
224
+ "completions/mean_length": 2711.27099609375,
225
+ "completions/mean_terminated_length": 1589.1905517578125,
226
+ "completions/min_length": 375.0,
227
+ "completions/min_terminated_length": 375.0,
228
+ "epoch": 0.009142857142857144,
229
+ "frac_reward_zero_std": 0.0,
230
+ "grad_norm": 1.013113021850586,
231
+ "kl": 0.0005372365315755209,
232
+ "learning_rate": 9.956206309337066e-07,
233
+ "loss": -1.336,
234
+ "num_tokens": 1184898.0,
235
+ "policy_entropy_avg": 8.125,
236
+ "reward": 0.5312491655349731,
237
+ "reward_std": 0.5121049880981445,
238
+ "rewards/cosine_scaled_reward/mean": 0.16906984150409698,
239
+ "rewards/cosine_scaled_reward/std": 0.48645660281181335,
240
+ "rewards/format_reward/mean": 0.4791666567325592,
241
+ "rewards/format_reward/std": 0.5048523545265198,
242
+ "step": 8
243
+ },
244
+ {
245
+ "clip_ratio/high_max": 0.0,
246
+ "clip_ratio/high_mean": 0.0,
247
+ "clip_ratio/low_mean": 0.0,
248
+ "clip_ratio/low_min": 0.0,
249
+ "clip_ratio/region_mean": 0.0,
250
+ "completions/clipped_ratio": 0.625,
251
+ "completions/max_length": 3584.0,
252
+ "completions/max_terminated_length": 3575.0,
253
+ "completions/mean_length": 2904.875,
254
+ "completions/mean_terminated_length": 1773.0,
255
+ "completions/min_length": 553.0,
256
+ "completions/min_terminated_length": 553.0,
257
+ "epoch": 0.010285714285714285,
258
+ "frac_reward_zero_std": 0.0,
259
+ "grad_norm": 0.7274989485740662,
260
+ "kl": 0.00063323974609375,
261
+ "learning_rate": 9.901664203302124e-07,
262
+ "loss": 0.2595,
263
+ "num_tokens": 1332924.0,
264
+ "policy_entropy_avg": 8.125,
265
+ "reward": 0.15100964903831482,
266
+ "reward_std": 0.5337426066398621,
267
+ "rewards/cosine_scaled_reward/mean": -0.09217208623886108,
268
+ "rewards/cosine_scaled_reward/std": 0.36159586906433105,
269
+ "rewards/format_reward/mean": 0.4166666567325592,
270
+ "rewards/format_reward/std": 0.49822381138801575,
271
+ "step": 9
272
+ },
273
+ {
274
+ "clip_ratio/high_max": 0.0,
275
+ "clip_ratio/high_mean": 0.0,
276
+ "clip_ratio/low_mean": 0.0,
277
+ "clip_ratio/low_min": 0.0,
278
+ "clip_ratio/region_mean": 0.0,
279
+ "completions/clipped_ratio": 0.6875,
280
+ "completions/max_length": 3584.0,
281
+ "completions/max_terminated_length": 3407.0,
282
+ "completions/mean_length": 2811.229248046875,
283
+ "completions/mean_terminated_length": 1111.1334228515625,
284
+ "completions/min_length": 407.0,
285
+ "completions/min_terminated_length": 407.0,
286
+ "epoch": 0.011428571428571429,
287
+ "frac_reward_zero_std": 0.0,
288
+ "grad_norm": 0.9453772306442261,
289
+ "kl": 0.0005699793497721354,
290
+ "learning_rate": 9.825677631722435e-07,
291
+ "loss": 0.1,
292
+ "num_tokens": 1475987.0,
293
+ "policy_entropy_avg": 8.125,
294
+ "reward": 0.001259398995898664,
295
+ "reward_std": 0.425419420003891,
296
+ "rewards/cosine_scaled_reward/mean": -0.17611455917358398,
297
+ "rewards/cosine_scaled_reward/std": 0.3572066128253937,
298
+ "rewards/format_reward/mean": 0.3541666567325592,
299
+ "rewards/format_reward/std": 0.48332110047340393,
300
+ "step": 10
301
+ },
302
+ {
303
+ "clip_ratio/high_max": 0.0,
304
+ "clip_ratio/high_mean": 0.0,
305
+ "clip_ratio/low_mean": 0.0,
306
+ "clip_ratio/low_min": 0.0,
307
+ "clip_ratio/region_mean": 0.0,
308
+ "completions/clipped_ratio": 0.8958333333333334,
309
+ "completions/max_length": 3584.0,
310
+ "completions/max_terminated_length": 3246.0,
311
+ "completions/mean_length": 3474.0,
312
+ "completions/mean_terminated_length": 2528.0,
313
+ "completions/min_length": 1503.0,
314
+ "completions/min_terminated_length": 1503.0,
315
+ "epoch": 0.012571428571428572,
316
+ "frac_reward_zero_std": 0.0,
317
+ "grad_norm": 0.8819871544837952,
318
+ "kl": 0.0006033579508463541,
319
+ "learning_rate": 9.728616793536587e-07,
320
+ "loss": 1.6889,
321
+ "num_tokens": 1651493.0,
322
+ "policy_entropy_avg": 8.125,
323
+ "reward": -0.1606103479862213,
324
+ "reward_std": 0.441455602645874,
325
+ "rewards/cosine_scaled_reward/mean": -0.19646306335926056,
326
+ "rewards/cosine_scaled_reward/std": 0.34247782826423645,
327
+ "rewards/format_reward/mean": 0.1458333283662796,
328
+ "rewards/format_reward/std": 0.3566739559173584,
329
+ "step": 11
330
+ },
331
+ {
332
+ "clip_ratio/high_max": 0.0,
333
+ "clip_ratio/high_mean": 0.0,
334
+ "clip_ratio/low_mean": 0.0,
335
+ "clip_ratio/low_min": 0.0,
336
+ "clip_ratio/region_mean": 0.0,
337
+ "completions/clipped_ratio": 0.45833333333333337,
338
+ "completions/max_length": 3584.0,
339
+ "completions/max_terminated_length": 3479.0,
340
+ "completions/mean_length": 2459.3125,
341
+ "completions/mean_terminated_length": 1507.6539306640625,
342
+ "completions/min_length": 698.0,
343
+ "completions/min_terminated_length": 698.0,
344
+ "epoch": 0.013714285714285714,
345
+ "frac_reward_zero_std": 0.0,
346
+ "grad_norm": 0.9987263083457947,
347
+ "kl": 0.0006459554036458334,
348
+ "learning_rate": 9.610954559391704e-07,
349
+ "loss": -0.7409,
350
+ "num_tokens": 1777748.0,
351
+ "policy_entropy_avg": 8.125,
352
+ "reward": 0.1868935525417328,
353
+ "reward_std": 0.5621633529663086,
354
+ "rewards/cosine_scaled_reward/mean": -0.13748572766780853,
355
+ "rewards/cosine_scaled_reward/std": 0.37971118092536926,
356
+ "rewards/format_reward/mean": 0.5625,
357
+ "rewards/format_reward/std": 0.5013279914855957,
358
+ "step": 12
359
+ },
360
+ {
361
+ "clip_ratio/high_max": 0.0,
362
+ "clip_ratio/high_mean": 0.0,
363
+ "clip_ratio/low_mean": 0.0,
364
+ "clip_ratio/low_min": 0.0,
365
+ "clip_ratio/region_mean": 0.0,
366
+ "completions/clipped_ratio": 0.5625,
367
+ "completions/max_length": 3584.0,
368
+ "completions/max_terminated_length": 3377.0,
369
+ "completions/mean_length": 2769.729248046875,
370
+ "completions/mean_terminated_length": 1722.8095703125,
371
+ "completions/min_length": 706.0,
372
+ "completions/min_terminated_length": 706.0,
373
+ "epoch": 0.014857142857142857,
374
+ "frac_reward_zero_std": 0.0,
375
+ "grad_norm": 0.6482232213020325,
376
+ "kl": 0.0006001790364583334,
377
+ "learning_rate": 9.473264167865171e-07,
378
+ "loss": -0.0518,
379
+ "num_tokens": 1918975.0,
380
+ "policy_entropy_avg": 8.125,
381
+ "reward": 0.15883508324623108,
382
+ "reward_std": 0.4439440965652466,
383
+ "rewards/cosine_scaled_reward/mean": -0.09656915813684464,
384
+ "rewards/cosine_scaled_reward/std": 0.38122376799583435,
385
+ "rewards/format_reward/mean": 0.4375,
386
+ "rewards/format_reward/std": 0.5013279914855957,
387
+ "step": 13
388
+ },
389
+ {
390
+ "clip_ratio/high_max": 0.0,
391
+ "clip_ratio/high_mean": 0.0,
392
+ "clip_ratio/low_mean": 0.0,
393
+ "clip_ratio/low_min": 0.0,
394
+ "clip_ratio/region_mean": 0.0,
395
+ "completions/clipped_ratio": 0.5208333333333333,
396
+ "completions/max_length": 3584.0,
397
+ "completions/max_terminated_length": 3364.0,
398
+ "completions/mean_length": 2708.541748046875,
399
+ "completions/mean_terminated_length": 1756.95654296875,
400
+ "completions/min_length": 661.0,
401
+ "completions/min_terminated_length": 661.0,
402
+ "epoch": 0.016,
403
+ "frac_reward_zero_std": 0.0,
404
+ "grad_norm": 1.1001851558685303,
405
+ "kl": 0.0006243387858072916,
406
+ "learning_rate": 9.316216432703916e-07,
407
+ "loss": -0.6409,
408
+ "num_tokens": 2056941.0,
409
+ "policy_entropy_avg": 8.125,
410
+ "reward": 0.2933271527290344,
411
+ "reward_std": 0.5121263861656189,
412
+ "rewards/cosine_scaled_reward/mean": -0.013947081752121449,
413
+ "rewards/cosine_scaled_reward/std": 0.49587875604629517,
414
+ "rewards/format_reward/mean": 0.4791666567325592,
415
+ "rewards/format_reward/std": 0.5048523545265198,
416
+ "step": 14
417
+ },
418
+ {
419
+ "clip_ratio/high_max": 0.0,
420
+ "clip_ratio/high_mean": 0.0,
421
+ "clip_ratio/low_mean": 0.0,
422
+ "clip_ratio/low_min": 0.0,
423
+ "clip_ratio/region_mean": 0.0,
424
+ "completions/clipped_ratio": 0.5833333333333333,
425
+ "completions/max_length": 3584.0,
426
+ "completions/max_terminated_length": 3430.0,
427
+ "completions/mean_length": 2640.125,
428
+ "completions/mean_terminated_length": 1318.7000732421875,
429
+ "completions/min_length": 425.0,
430
+ "completions/min_terminated_length": 425.0,
431
+ "epoch": 0.017142857142857144,
432
+ "frac_reward_zero_std": 0.0,
433
+ "grad_norm": 1.0151442289352417,
434
+ "kl": 0.0005944569905598959,
435
+ "learning_rate": 9.140576474687263e-07,
436
+ "loss": -0.6683,
437
+ "num_tokens": 2191857.0,
438
+ "policy_entropy_avg": 8.125,
439
+ "reward": 0.2889822721481323,
440
+ "reward_std": 0.31458932161331177,
441
+ "rewards/cosine_scaled_reward/mean": -0.038122642785310745,
442
+ "rewards/cosine_scaled_reward/std": 0.44473376870155334,
443
+ "rewards/format_reward/mean": 0.5208333134651184,
444
+ "rewards/format_reward/std": 0.5048523545265198,
445
+ "step": 15
446
+ },
447
+ {
448
+ "clip_ratio/high_max": 0.0,
449
+ "clip_ratio/high_mean": 0.0,
450
+ "clip_ratio/low_mean": 0.0,
451
+ "clip_ratio/low_min": 0.0,
452
+ "clip_ratio/region_mean": 0.0,
453
+ "completions/clipped_ratio": 0.9583333333333334,
454
+ "completions/max_length": 3584.0,
455
+ "completions/max_terminated_length": 3304.0,
456
+ "completions/mean_length": 3522.1875,
457
+ "completions/mean_terminated_length": 2100.5,
458
+ "completions/min_length": 897.0,
459
+ "completions/min_terminated_length": 897.0,
460
+ "epoch": 0.018285714285714287,
461
+ "frac_reward_zero_std": 0.0,
462
+ "grad_norm": 0.9443197250366211,
463
+ "kl": 0.0007228851318359375,
464
+ "learning_rate": 8.9471999940354e-07,
465
+ "loss": 2.2731,
466
+ "num_tokens": 2368752.0,
467
+ "policy_entropy_avg": 8.125,
468
+ "reward": -0.21434035897254944,
469
+ "reward_std": 0.31254154443740845,
470
+ "rewards/cosine_scaled_reward/mean": -0.20654386281967163,
471
+ "rewards/cosine_scaled_reward/std": 0.2568126320838928,
472
+ "rewards/format_reward/mean": 0.0833333358168602,
473
+ "rewards/format_reward/std": 0.2793101966381073,
474
+ "step": 16
475
+ },
476
+ {
477
+ "clip_ratio/high_max": 0.0,
478
+ "clip_ratio/high_mean": 0.0,
479
+ "clip_ratio/low_mean": 0.0,
480
+ "clip_ratio/low_min": 0.0,
481
+ "clip_ratio/region_mean": 0.0,
482
+ "completions/clipped_ratio": 0.45833333333333337,
483
+ "completions/max_length": 3584.0,
484
+ "completions/max_terminated_length": 3076.0,
485
+ "completions/mean_length": 2299.875,
486
+ "completions/mean_terminated_length": 1213.3077392578125,
487
+ "completions/min_length": 364.0,
488
+ "completions/min_terminated_length": 364.0,
489
+ "epoch": 0.019428571428571427,
490
+ "frac_reward_zero_std": 0.0,
491
+ "grad_norm": 0.764594554901123,
492
+ "kl": 0.000736236572265625,
493
+ "learning_rate": 8.737029101523929e-07,
494
+ "loss": -0.0316,
495
+ "num_tokens": 2487036.0,
496
+ "policy_entropy_avg": 8.125,
497
+ "reward": 0.3496870696544647,
498
+ "reward_std": 0.5126497745513916,
499
+ "rewards/cosine_scaled_reward/mean": -0.0018433034420013428,
500
+ "rewards/cosine_scaled_reward/std": 0.4425233006477356,
501
+ "rewards/format_reward/mean": 0.5416666865348816,
502
+ "rewards/format_reward/std": 0.5035336017608643,
503
+ "step": 17
504
+ },
505
+ {
506
+ "clip_ratio/high_max": 0.0,
507
+ "clip_ratio/high_mean": 0.0,
508
+ "clip_ratio/low_mean": 0.0,
509
+ "clip_ratio/low_min": 0.0,
510
+ "clip_ratio/region_mean": 0.0,
511
+ "completions/clipped_ratio": 0.5416666666666667,
512
+ "completions/max_length": 3584.0,
513
+ "completions/max_terminated_length": 3228.0,
514
+ "completions/mean_length": 2700.3125,
515
+ "completions/mean_terminated_length": 1655.95458984375,
516
+ "completions/min_length": 536.0,
517
+ "completions/min_terminated_length": 536.0,
518
+ "epoch": 0.02057142857142857,
519
+ "frac_reward_zero_std": 0.0,
520
+ "grad_norm": 0.8999841213226318,
521
+ "kl": 0.0006821950276692709,
522
+ "learning_rate": 8.511087728614862e-07,
523
+ "loss": -0.6147,
524
+ "num_tokens": 2624433.0,
525
+ "policy_entropy_avg": 8.125,
526
+ "reward": 0.26751697063446045,
527
+ "reward_std": 0.4905094802379608,
528
+ "rewards/cosine_scaled_reward/mean": -0.05463438108563423,
529
+ "rewards/cosine_scaled_reward/std": 0.447592556476593,
530
+ "rewards/format_reward/mean": 0.5208333134651184,
531
+ "rewards/format_reward/std": 0.5048523545265198,
532
+ "step": 18
533
+ },
534
+ {
535
+ "clip_ratio/high_max": 0.0,
536
+ "clip_ratio/high_mean": 0.0,
537
+ "clip_ratio/low_mean": 0.0,
538
+ "clip_ratio/low_min": 0.0,
539
+ "clip_ratio/region_mean": 0.0,
540
+ "completions/clipped_ratio": 0.6666666666666667,
541
+ "completions/max_length": 3584.0,
542
+ "completions/max_terminated_length": 3405.0,
543
+ "completions/mean_length": 2786.77099609375,
544
+ "completions/mean_terminated_length": 1192.3125,
545
+ "completions/min_length": 397.0,
546
+ "completions/min_terminated_length": 397.0,
547
+ "epoch": 0.021714285714285714,
548
+ "frac_reward_zero_std": 0.0,
549
+ "grad_norm": 1.0238417387008667,
550
+ "kl": 0.0006643931070963541,
551
+ "learning_rate": 8.270476638965461e-07,
552
+ "loss": -0.0253,
553
+ "num_tokens": 2766640.0,
554
+ "policy_entropy_avg": 8.125,
555
+ "reward": 0.3180326819419861,
556
+ "reward_std": 0.5151762366294861,
557
+ "rewards/cosine_scaled_reward/mean": 0.03630712628364563,
558
+ "rewards/cosine_scaled_reward/std": 0.47822093963623047,
559
+ "rewards/format_reward/mean": 0.4166666567325592,
560
+ "rewards/format_reward/std": 0.49822381138801575,
561
+ "step": 19
562
+ },
563
+ {
564
+ "clip_ratio/high_max": 0.0,
565
+ "clip_ratio/high_mean": 0.0,
566
+ "clip_ratio/low_mean": 0.0,
567
+ "clip_ratio/low_min": 0.0,
568
+ "clip_ratio/region_mean": 0.0,
569
+ "completions/clipped_ratio": 0.375,
570
+ "completions/max_length": 3584.0,
571
+ "completions/max_terminated_length": 3561.0,
572
+ "completions/mean_length": 2221.1875,
573
+ "completions/mean_terminated_length": 1403.5001220703125,
574
+ "completions/min_length": 359.0,
575
+ "completions/min_terminated_length": 359.0,
576
+ "epoch": 0.022857142857142857,
577
+ "frac_reward_zero_std": 0.0,
578
+ "grad_norm": 1.0431965589523315,
579
+ "kl": 0.0006084442138671875,
580
+ "learning_rate": 8.01636806561836e-07,
581
+ "loss": -0.6551,
582
+ "num_tokens": 2881771.0,
583
+ "policy_entropy_avg": 8.125,
584
+ "reward": 0.3996829688549042,
585
+ "reward_std": 0.34862709045410156,
586
+ "rewards/cosine_scaled_reward/mean": -0.06755157560110092,
587
+ "rewards/cosine_scaled_reward/std": 0.39543306827545166,
588
+ "rewards/format_reward/mean": 0.75,
589
+ "rewards/format_reward/std": 0.4375949800014496,
590
+ "step": 20
591
+ },
592
+ {
593
+ "clip_ratio/high_max": 0.0,
594
+ "clip_ratio/high_mean": 0.0,
595
+ "clip_ratio/low_mean": 0.0,
596
+ "clip_ratio/low_min": 0.0,
597
+ "clip_ratio/region_mean": 0.0,
598
+ "completions/clipped_ratio": 0.5208333333333333,
599
+ "completions/max_length": 3584.0,
600
+ "completions/max_terminated_length": 3408.0,
601
+ "completions/mean_length": 2486.52099609375,
602
+ "completions/mean_terminated_length": 1293.6087646484375,
603
+ "completions/min_length": 343.0,
604
+ "completions/min_terminated_length": 343.0,
605
+ "epoch": 0.024,
606
+ "frac_reward_zero_std": 0.0,
607
+ "grad_norm": 1.1584967374801636,
608
+ "kl": 0.0008227030436197916,
609
+ "learning_rate": 7.75e-07,
610
+ "loss": -0.3451,
611
+ "num_tokens": 3009230.0,
612
+ "policy_entropy_avg": 8.125,
613
+ "reward": 0.08958987891674042,
614
+ "reward_std": 0.3577960431575775,
615
+ "rewards/cosine_scaled_reward/mean": -0.17066805064678192,
616
+ "rewards/cosine_scaled_reward/std": 0.31892454624176025,
617
+ "rewards/format_reward/mean": 0.4791666567325592,
618
+ "rewards/format_reward/std": 0.5048523545265198,
619
+ "step": 21
620
+ },
621
+ {
622
+ "clip_ratio/high_max": 0.0,
623
+ "clip_ratio/high_mean": 0.0,
624
+ "clip_ratio/low_mean": 0.0,
625
+ "clip_ratio/low_min": 0.0,
626
+ "clip_ratio/region_mean": 0.0,
627
+ "completions/clipped_ratio": 0.22916666666666663,
628
+ "completions/max_length": 3584.0,
629
+ "completions/max_terminated_length": 3488.0,
630
+ "completions/mean_length": 1677.9583740234375,
631
+ "completions/mean_terminated_length": 1111.29736328125,
632
+ "completions/min_length": 253.0,
633
+ "completions/min_terminated_length": 253.0,
634
+ "epoch": 0.025142857142857144,
635
+ "frac_reward_zero_std": 0.0,
636
+ "grad_norm": 2.2852933406829834,
637
+ "kl": 0.000949859619140625,
638
+ "learning_rate": 7.472670160550848e-07,
639
+ "loss": -2.4718,
640
+ "num_tokens": 3096534.0,
641
+ "policy_entropy_avg": 8.125,
642
+ "reward": 0.5749191045761108,
643
+ "reward_std": 0.3676217198371887,
644
+ "rewards/cosine_scaled_reward/mean": 0.015162050724029541,
645
+ "rewards/cosine_scaled_reward/std": 0.4574022591114044,
646
+ "rewards/format_reward/mean": 0.8541666865348816,
647
+ "rewards/format_reward/std": 0.3566739857196808,
648
+ "step": 22
649
+ },
650
+ {
651
+ "clip_ratio/high_max": 0.0,
652
+ "clip_ratio/high_mean": 0.0,
653
+ "clip_ratio/low_mean": 0.0,
654
+ "clip_ratio/low_min": 0.0,
655
+ "clip_ratio/region_mean": 0.0,
656
+ "completions/clipped_ratio": 0.4375,
657
+ "completions/max_length": 3584.0,
658
+ "completions/max_terminated_length": 3523.0,
659
+ "completions/mean_length": 2406.979248046875,
660
+ "completions/mean_terminated_length": 1491.5185546875,
661
+ "completions/min_length": 402.0,
662
+ "completions/min_terminated_length": 402.0,
663
+ "epoch": 0.026285714285714287,
664
+ "frac_reward_zero_std": 0.0,
665
+ "grad_norm": 0.7508360147476196,
666
+ "kl": 0.0007603963216145834,
667
+ "learning_rate": 7.185729670371604e-07,
668
+ "loss": 0.5997,
669
+ "num_tokens": 3220163.0,
670
+ "policy_entropy_avg": 8.125,
671
+ "reward": 0.29059678316116333,
672
+ "reward_std": 0.5978084802627563,
673
+ "rewards/cosine_scaled_reward/mean": -0.06813069432973862,
674
+ "rewards/cosine_scaled_reward/std": 0.44070079922676086,
675
+ "rewards/format_reward/mean": 0.5833333134651184,
676
+ "rewards/format_reward/std": 0.49822378158569336,
677
+ "step": 23
678
+ },
679
+ {
680
+ "clip_ratio/high_max": 0.0,
681
+ "clip_ratio/high_mean": 0.0,
682
+ "clip_ratio/low_mean": 0.0,
683
+ "clip_ratio/low_min": 0.0,
684
+ "clip_ratio/region_mean": 0.0,
685
+ "completions/clipped_ratio": 0.5,
686
+ "completions/max_length": 3584.0,
687
+ "completions/max_terminated_length": 3552.0,
688
+ "completions/mean_length": 2770.39599609375,
689
+ "completions/mean_terminated_length": 1956.791748046875,
690
+ "completions/min_length": 381.0,
691
+ "completions/min_terminated_length": 381.0,
692
+ "epoch": 0.027428571428571427,
693
+ "frac_reward_zero_std": 0.0,
694
+ "grad_norm": 0.5464559197425842,
695
+ "kl": 0.0007966359456380209,
696
+ "learning_rate": 6.890576474687263e-07,
697
+ "loss": -0.6853,
698
+ "num_tokens": 3361338.0,
699
+ "policy_entropy_avg": 8.125,
700
+ "reward": 0.2939620018005371,
701
+ "reward_std": 0.60921311378479,
702
+ "rewards/cosine_scaled_reward/mean": -0.034292057156562805,
703
+ "rewards/cosine_scaled_reward/std": 0.44030100107192993,
704
+ "rewards/format_reward/mean": 0.5208333134651184,
705
+ "rewards/format_reward/std": 0.5048523545265198,
706
+ "step": 24
707
+ },
708
+ {
709
+ "clip_ratio/high_max": 0.0,
710
+ "clip_ratio/high_mean": 0.0,
711
+ "clip_ratio/low_mean": 0.0,
712
+ "clip_ratio/low_min": 0.0,
713
+ "clip_ratio/region_mean": 0.0,
714
+ "completions/clipped_ratio": 0.6041666666666667,
715
+ "completions/max_length": 3584.0,
716
+ "completions/max_terminated_length": 2654.0,
717
+ "completions/mean_length": 2720.8125,
718
+ "completions/mean_terminated_length": 1403.3157958984375,
719
+ "completions/min_length": 509.0,
720
+ "completions/min_terminated_length": 509.0,
721
+ "epoch": 0.02857142857142857,
722
+ "frac_reward_zero_std": 0.0,
723
+ "grad_norm": 0.8707253336906433,
724
+ "kl": 0.000972747802734375,
725
+ "learning_rate": 6.588648530198504e-07,
726
+ "loss": 0.227,
727
+ "num_tokens": 3499659.0,
728
+ "policy_entropy_avg": 8.125,
729
+ "reward": 0.19027817249298096,
730
+ "reward_std": 0.5284578204154968,
731
+ "rewards/cosine_scaled_reward/mean": -0.051548827439546585,
732
+ "rewards/cosine_scaled_reward/std": 0.4388459026813507,
733
+ "rewards/format_reward/mean": 0.3958333432674408,
734
+ "rewards/format_reward/std": 0.49420398473739624,
735
+ "step": 25
736
+ },
737
+ {
738
+ "clip_ratio/high_max": 0.0,
739
+ "clip_ratio/high_mean": 0.0,
740
+ "clip_ratio/low_mean": 0.0,
741
+ "clip_ratio/low_min": 0.0,
742
+ "clip_ratio/region_mean": 0.0,
743
+ "completions/clipped_ratio": 0.6041666666666667,
744
+ "completions/max_length": 3584.0,
745
+ "completions/max_terminated_length": 3077.0,
746
+ "completions/mean_length": 2909.729248046875,
747
+ "completions/mean_terminated_length": 1880.5789794921875,
748
+ "completions/min_length": 943.0,
749
+ "completions/min_terminated_length": 943.0,
750
+ "epoch": 0.029714285714285714,
751
+ "frac_reward_zero_std": 0.0,
752
+ "grad_norm": 0.9067810773849487,
753
+ "kl": 0.0007839202880859375,
754
+ "learning_rate": 6.281416799501187e-07,
755
+ "loss": 0.0152,
756
+ "num_tokens": 3647300.0,
757
+ "policy_entropy_avg": 8.125,
758
+ "reward": 0.1333228200674057,
759
+ "reward_std": 0.3536017835140228,
760
+ "rewards/cosine_scaled_reward/mean": -0.1057773232460022,
761
+ "rewards/cosine_scaled_reward/std": 0.3763800859451294,
762
+ "rewards/format_reward/mean": 0.4166666567325592,
763
+ "rewards/format_reward/std": 0.49822381138801575,
764
+ "step": 26
765
+ },
766
+ {
767
+ "clip_ratio/high_max": 0.0,
768
+ "clip_ratio/high_mean": 0.0,
769
+ "clip_ratio/low_mean": 0.0,
770
+ "clip_ratio/low_min": 0.0,
771
+ "clip_ratio/region_mean": 0.0,
772
+ "completions/clipped_ratio": 0.6458333333333333,
773
+ "completions/max_length": 3584.0,
774
+ "completions/max_terminated_length": 3495.0,
775
+ "completions/mean_length": 2900.666748046875,
776
+ "completions/mean_terminated_length": 1654.5882568359375,
777
+ "completions/min_length": 471.0,
778
+ "completions/min_terminated_length": 471.0,
779
+ "epoch": 0.030857142857142857,
780
+ "frac_reward_zero_std": 0.0,
781
+ "grad_norm": 2.270411729812622,
782
+ "kl": 0.00095367431640625,
783
+ "learning_rate": 5.97037808470444e-07,
784
+ "loss": -0.0823,
785
+ "num_tokens": 3794248.0,
786
+ "policy_entropy_avg": 8.125,
787
+ "reward": 0.1146991103887558,
788
+ "reward_std": 0.4322494864463806,
789
+ "rewards/cosine_scaled_reward/mean": -0.10968658328056335,
790
+ "rewards/cosine_scaled_reward/std": 0.319176584482193,
791
+ "rewards/format_reward/mean": 0.3958333432674408,
792
+ "rewards/format_reward/std": 0.49420398473739624,
793
+ "step": 27
794
+ },
795
+ {
796
+ "clip_ratio/high_max": 0.0,
797
+ "clip_ratio/high_mean": 0.0,
798
+ "clip_ratio/low_mean": 0.0,
799
+ "clip_ratio/low_min": 0.0,
800
+ "clip_ratio/region_mean": 0.0,
801
+ "completions/clipped_ratio": 0.5208333333333333,
802
+ "completions/max_length": 3584.0,
803
+ "completions/max_terminated_length": 3292.0,
804
+ "completions/mean_length": 2623.14599609375,
805
+ "completions/mean_terminated_length": 1578.7391357421875,
806
+ "completions/min_length": 433.0,
807
+ "completions/min_terminated_length": 433.0,
808
+ "epoch": 0.032,
809
+ "frac_reward_zero_std": 0.0,
810
+ "grad_norm": 1.2852171659469604,
811
+ "kl": 0.0007775624593098959,
812
+ "learning_rate": 5.657047735161255e-07,
813
+ "loss": -0.6041,
814
+ "num_tokens": 3927911.0,
815
+ "policy_entropy_avg": 8.125,
816
+ "reward": 0.45130008459091187,
817
+ "reward_std": 0.44697779417037964,
818
+ "rewards/cosine_scaled_reward/mean": 0.08673719316720963,
819
+ "rewards/cosine_scaled_reward/std": 0.49450618028640747,
820
+ "rewards/format_reward/mean": 0.5208333134651184,
821
+ "rewards/format_reward/std": 0.5048523545265198,
822
+ "step": 28
823
+ },
824
+ {
825
+ "clip_ratio/high_max": 0.0,
826
+ "clip_ratio/high_mean": 0.0,
827
+ "clip_ratio/low_mean": 0.0,
828
+ "clip_ratio/low_min": 0.0,
829
+ "clip_ratio/region_mean": 0.0,
830
+ "completions/clipped_ratio": 0.8125,
831
+ "completions/max_length": 3584.0,
832
+ "completions/max_terminated_length": 3166.0,
833
+ "completions/mean_length": 3315.20849609375,
834
+ "completions/mean_terminated_length": 2150.444580078125,
835
+ "completions/min_length": 597.0,
836
+ "completions/min_terminated_length": 597.0,
837
+ "epoch": 0.03314285714285714,
838
+ "frac_reward_zero_std": 0.0,
839
+ "grad_norm": 0.5690914392471313,
840
+ "kl": 0.0009403228759765625,
841
+ "learning_rate": 5.342952264838747e-07,
842
+ "loss": 0.4887,
843
+ "num_tokens": 4094895.0,
844
+ "policy_entropy_avg": 8.135416666666666,
845
+ "reward": 0.017309244722127914,
846
+ "reward_std": 0.44446587562561035,
847
+ "rewards/cosine_scaled_reward/mean": -0.10126852989196777,
848
+ "rewards/cosine_scaled_reward/std": 0.3542354702949524,
849
+ "rewards/format_reward/mean": 0.2291666716337204,
850
+ "rewards/format_reward/std": 0.4247443675994873,
851
+ "step": 29
852
+ },
853
+ {
854
+ "clip_ratio/high_max": 0.0,
855
+ "clip_ratio/high_mean": 0.0,
856
+ "clip_ratio/low_mean": 0.0,
857
+ "clip_ratio/low_min": 0.0,
858
+ "clip_ratio/region_mean": 0.0,
859
+ "completions/clipped_ratio": 0.5833333333333333,
860
+ "completions/max_length": 3584.0,
861
+ "completions/max_terminated_length": 3493.0,
862
+ "completions/mean_length": 2867.52099609375,
863
+ "completions/mean_terminated_length": 1864.4500732421875,
864
+ "completions/min_length": 716.0,
865
+ "completions/min_terminated_length": 716.0,
866
+ "epoch": 0.03428571428571429,
867
+ "frac_reward_zero_std": 0.0,
868
+ "grad_norm": 1.0161747932434082,
869
+ "kl": 0.0011774698893229167,
870
+ "learning_rate": 5.02962191529556e-07,
871
+ "loss": 0.2395,
872
+ "num_tokens": 4240438.0,
873
+ "policy_entropy_avg": 8.125,
874
+ "reward": 0.1694916933774948,
875
+ "reward_std": 0.6536720395088196,
876
+ "rewards/cosine_scaled_reward/mean": -0.09878844022750854,
877
+ "rewards/cosine_scaled_reward/std": 0.39795705676078796,
878
+ "rewards/format_reward/mean": 0.4583333432674408,
879
+ "rewards/format_reward/std": 0.5035336017608643,
880
+ "step": 30
881
+ },
882
+ {
883
+ "clip_ratio/high_max": 0.0,
884
+ "clip_ratio/high_mean": 0.0,
885
+ "clip_ratio/low_mean": 0.0,
886
+ "clip_ratio/low_min": 0.0,
887
+ "clip_ratio/region_mean": 0.0,
888
+ "completions/clipped_ratio": 0.7291666666666667,
889
+ "completions/max_length": 3584.0,
890
+ "completions/max_terminated_length": 3384.0,
891
+ "completions/mean_length": 2928.875,
892
+ "completions/mean_terminated_length": 1165.0770263671875,
893
+ "completions/min_length": 271.0,
894
+ "completions/min_terminated_length": 271.0,
895
+ "epoch": 0.03542857142857143,
896
+ "frac_reward_zero_std": 0.0,
897
+ "grad_norm": 1.1587620973587036,
898
+ "kl": 0.000823974609375,
899
+ "learning_rate": 4.7185832004988133e-07,
900
+ "loss": 0.1645,
901
+ "num_tokens": 4388896.0,
902
+ "policy_entropy_avg": 8.125,
903
+ "reward": 0.0713512971997261,
904
+ "reward_std": 0.43281668424606323,
905
+ "rewards/cosine_scaled_reward/mean": -0.0909477099776268,
906
+ "rewards/cosine_scaled_reward/std": 0.3611638844013214,
907
+ "rewards/format_reward/mean": 0.2916666567325592,
908
+ "rewards/format_reward/std": 0.4593396782875061,
909
+ "step": 31
910
+ },
911
+ {
912
+ "clip_ratio/high_max": 0.0,
913
+ "clip_ratio/high_mean": 0.0,
914
+ "clip_ratio/low_mean": 0.0,
915
+ "clip_ratio/low_min": 0.0,
916
+ "clip_ratio/region_mean": 0.0,
917
+ "completions/clipped_ratio": 0.6041666666666667,
918
+ "completions/max_length": 3584.0,
919
+ "completions/max_terminated_length": 2794.0,
920
+ "completions/mean_length": 2802.9375,
921
+ "completions/mean_terminated_length": 1610.7894287109375,
922
+ "completions/min_length": 529.0,
923
+ "completions/min_terminated_length": 529.0,
924
+ "epoch": 0.036571428571428574,
925
+ "frac_reward_zero_std": 0.0,
926
+ "grad_norm": 0.7285576462745667,
927
+ "kl": 0.0011965433756510417,
928
+ "learning_rate": 4.4113514698014953e-07,
929
+ "loss": 0.5699,
930
+ "num_tokens": 4531201.0,
931
+ "policy_entropy_avg": 8.125,
932
+ "reward": 0.13415873050689697,
933
+ "reward_std": 0.4497126042842865,
934
+ "rewards/cosine_scaled_reward/mean": -0.09471765905618668,
935
+ "rewards/cosine_scaled_reward/std": 0.3660888373851776,
936
+ "rewards/format_reward/mean": 0.3958333432674408,
937
+ "rewards/format_reward/std": 0.49420398473739624,
938
+ "step": 32
939
+ },
940
+ {
941
+ "clip_ratio/high_max": 0.0,
942
+ "clip_ratio/high_mean": 0.0,
943
+ "clip_ratio/low_mean": 0.0,
944
+ "clip_ratio/low_min": 0.0,
945
+ "clip_ratio/region_mean": 0.0,
946
+ "completions/clipped_ratio": 0.6666666666666667,
947
+ "completions/max_length": 3584.0,
948
+ "completions/max_terminated_length": 3527.0,
949
+ "completions/mean_length": 3081.33349609375,
950
+ "completions/mean_terminated_length": 2076.0,
951
+ "completions/min_length": 765.0,
952
+ "completions/min_terminated_length": 765.0,
953
+ "epoch": 0.037714285714285714,
954
+ "frac_reward_zero_std": 0.0,
955
+ "grad_norm": 0.5806891322135925,
956
+ "kl": 0.0007527669270833334,
957
+ "learning_rate": 4.1094235253127374e-07,
958
+ "loss": -0.155,
959
+ "num_tokens": 4687517.0,
960
+ "policy_entropy_avg": 8.125,
961
+ "reward": 0.052633434534072876,
962
+ "reward_std": 0.5880012512207031,
963
+ "rewards/cosine_scaled_reward/mean": -0.14701275527477264,
964
+ "rewards/cosine_scaled_reward/std": 0.3462415635585785,
965
+ "rewards/format_reward/mean": 0.375,
966
+ "rewards/format_reward/std": 0.48924607038497925,
967
+ "step": 33
968
+ },
969
+ {
970
+ "clip_ratio/high_max": 0.0,
971
+ "clip_ratio/high_mean": 0.0,
972
+ "clip_ratio/low_mean": 0.0,
973
+ "clip_ratio/low_min": 0.0,
974
+ "clip_ratio/region_mean": 0.0,
975
+ "completions/clipped_ratio": 0.41666666666666663,
976
+ "completions/max_length": 3584.0,
977
+ "completions/max_terminated_length": 3110.0,
978
+ "completions/mean_length": 2213.104248046875,
979
+ "completions/mean_terminated_length": 1233.8929443359375,
980
+ "completions/min_length": 582.0,
981
+ "completions/min_terminated_length": 582.0,
982
+ "epoch": 0.038857142857142854,
983
+ "frac_reward_zero_std": 0.0,
984
+ "grad_norm": 1.0047067403793335,
985
+ "kl": 0.0015347798665364583,
986
+ "learning_rate": 3.8142703296283953e-07,
987
+ "loss": 0.6987,
988
+ "num_tokens": 4800910.0,
989
+ "policy_entropy_avg": 8.125,
990
+ "reward": 0.43960699439048767,
991
+ "reward_std": 0.5042138695716858,
992
+ "rewards/cosine_scaled_reward/mean": 0.025659168139100075,
993
+ "rewards/cosine_scaled_reward/std": 0.5110981464385986,
994
+ "rewards/format_reward/mean": 0.625,
995
+ "rewards/format_reward/std": 0.48924607038497925,
996
+ "step": 34
997
+ },
998
+ {
999
+ "clip_ratio/high_max": 0.0,
1000
+ "clip_ratio/high_mean": 0.0,
1001
+ "clip_ratio/low_mean": 0.0,
1002
+ "clip_ratio/low_min": 0.0,
1003
+ "clip_ratio/region_mean": 0.0,
1004
+ "completions/clipped_ratio": 0.5833333333333333,
1005
+ "completions/max_length": 3584.0,
1006
+ "completions/max_terminated_length": 3482.0,
1007
+ "completions/mean_length": 2852.125,
1008
+ "completions/mean_terminated_length": 1827.5,
1009
+ "completions/min_length": 369.0,
1010
+ "completions/min_terminated_length": 369.0,
1011
+ "epoch": 0.04,
1012
+ "frac_reward_zero_std": 0.0,
1013
+ "grad_norm": 0.6956940293312073,
1014
+ "kl": 0.0008710225423177084,
1015
+ "learning_rate": 3.5273298394491515e-07,
1016
+ "loss": 0.023,
1017
+ "num_tokens": 4946434.0,
1018
+ "policy_entropy_avg": 8.125,
1019
+ "reward": 0.27234378457069397,
1020
+ "reward_std": 0.6467978954315186,
1021
+ "rewards/cosine_scaled_reward/mean": -0.030088132247328758,
1022
+ "rewards/cosine_scaled_reward/std": 0.4617981016635895,
1023
+ "rewards/format_reward/mean": 0.4791666567325592,
1024
+ "rewards/format_reward/std": 0.5048523545265198,
1025
+ "step": 35
1026
+ },
1027
+ {
1028
+ "clip_ratio/high_max": 0.0,
1029
+ "clip_ratio/high_mean": 0.0,
1030
+ "clip_ratio/low_mean": 0.0,
1031
+ "clip_ratio/low_min": 0.0,
1032
+ "clip_ratio/region_mean": 0.0,
1033
+ "completions/clipped_ratio": 0.7916666666666666,
1034
+ "completions/max_length": 3584.0,
1035
+ "completions/max_terminated_length": 3373.0,
1036
+ "completions/mean_length": 3279.95849609375,
1037
+ "completions/mean_terminated_length": 2124.60009765625,
1038
+ "completions/min_length": 920.0,
1039
+ "completions/min_terminated_length": 920.0,
1040
+ "epoch": 0.04114285714285714,
1041
+ "frac_reward_zero_std": 0.0,
1042
+ "grad_norm": 0.6212595105171204,
1043
+ "kl": 0.0012715657552083333,
1044
+ "learning_rate": 3.250000000000001e-07,
1045
+ "loss": 0.6297,
1046
+ "num_tokens": 5112206.0,
1047
+ "policy_entropy_avg": 8.135416666666666,
1048
+ "reward": -0.14957016706466675,
1049
+ "reward_std": 0.34586209058761597,
1050
+ "rewards/cosine_scaled_reward/mean": -0.2400539517402649,
1051
+ "rewards/cosine_scaled_reward/std": 0.1841082125902176,
1052
+ "rewards/format_reward/mean": 0.25,
1053
+ "rewards/format_reward/std": 0.4375949800014496,
1054
+ "step": 36
1055
+ },
1056
+ {
1057
+ "clip_ratio/high_max": 0.0,
1058
+ "clip_ratio/high_mean": 0.0,
1059
+ "clip_ratio/low_mean": 0.0,
1060
+ "clip_ratio/low_min": 0.0,
1061
+ "clip_ratio/region_mean": 0.0,
1062
+ "completions/clipped_ratio": 0.75,
1063
+ "completions/max_length": 3584.0,
1064
+ "completions/max_terminated_length": 3444.0,
1065
+ "completions/mean_length": 3138.104248046875,
1066
+ "completions/mean_terminated_length": 1800.416748046875,
1067
+ "completions/min_length": 474.0,
1068
+ "completions/min_terminated_length": 474.0,
1069
+ "epoch": 0.04228571428571429,
1070
+ "frac_reward_zero_std": 0.0,
1071
+ "grad_norm": 0.7584943771362305,
1072
+ "kl": 0.0009466807047526041,
1073
+ "learning_rate": 2.9836319343816397e-07,
1074
+ "loss": 1.1086,
1075
+ "num_tokens": 5271103.0,
1076
+ "policy_entropy_avg": 8.135416666666666,
1077
+ "reward": -0.1891360729932785,
1078
+ "reward_std": 0.35265272855758667,
1079
+ "rewards/cosine_scaled_reward/mean": -0.27048927545547485,
1080
+ "rewards/cosine_scaled_reward/std": 0.19963285326957703,
1081
+ "rewards/format_reward/mean": 0.25,
1082
+ "rewards/format_reward/std": 0.4375949800014496,
1083
+ "step": 37
1084
+ },
1085
+ {
1086
+ "clip_ratio/high_max": 0.0,
1087
+ "clip_ratio/high_mean": 0.0,
1088
+ "clip_ratio/low_mean": 0.0,
1089
+ "clip_ratio/low_min": 0.0,
1090
+ "clip_ratio/region_mean": 0.0,
1091
+ "completions/clipped_ratio": 0.8333333333333334,
1092
+ "completions/max_length": 3584.0,
1093
+ "completions/max_terminated_length": 2923.0,
1094
+ "completions/mean_length": 3194.5,
1095
+ "completions/mean_terminated_length": 1247.0,
1096
+ "completions/min_length": 478.0,
1097
+ "completions/min_terminated_length": 478.0,
1098
+ "epoch": 0.04342857142857143,
1099
+ "frac_reward_zero_std": 0.0,
1100
+ "grad_norm": 0.9534838795661926,
1101
+ "kl": 0.0009625752766927084,
1102
+ "learning_rate": 2.729523361034538e-07,
1103
+ "loss": 0.6428,
1104
+ "num_tokens": 5433223.0,
1105
+ "policy_entropy_avg": 8.125,
1106
+ "reward": -0.03065665066242218,
1107
+ "reward_std": 0.27790385484695435,
1108
+ "rewards/cosine_scaled_reward/mean": -0.10691537708044052,
1109
+ "rewards/cosine_scaled_reward/std": 0.2967289388179779,
1110
+ "rewards/format_reward/mean": 0.1666666716337204,
1111
+ "rewards/format_reward/std": 0.3766217827796936,
1112
+ "step": 38
1113
+ },
1114
+ {
1115
+ "clip_ratio/high_max": 0.0,
1116
+ "clip_ratio/high_mean": 0.0,
1117
+ "clip_ratio/low_mean": 0.0,
1118
+ "clip_ratio/low_min": 0.0,
1119
+ "clip_ratio/region_mean": 0.0,
1120
+ "completions/clipped_ratio": 0.5416666666666667,
1121
+ "completions/max_length": 3584.0,
1122
+ "completions/max_terminated_length": 3191.0,
1123
+ "completions/mean_length": 2574.58349609375,
1124
+ "completions/mean_terminated_length": 1381.6363525390625,
1125
+ "completions/min_length": 326.0,
1126
+ "completions/min_terminated_length": 326.0,
1127
+ "epoch": 0.044571428571428574,
1128
+ "frac_reward_zero_std": 0.0,
1129
+ "grad_norm": 38.30278396606445,
1130
+ "kl": 0.0010172526041666667,
1131
+ "learning_rate": 2.488912271385139e-07,
1132
+ "loss": -21.1377,
1133
+ "num_tokens": 5564891.0,
1134
+ "policy_entropy_avg": 8.125,
1135
+ "reward": 0.24889466166496277,
1136
+ "reward_std": 0.4223267734050751,
1137
+ "rewards/cosine_scaled_reward/mean": -0.03770924732089043,
1138
+ "rewards/cosine_scaled_reward/std": 0.4333648681640625,
1139
+ "rewards/format_reward/mean": 0.4583333432674408,
1140
+ "rewards/format_reward/std": 0.5035336017608643,
1141
+ "step": 39
1142
+ },
1143
+ {
1144
+ "clip_ratio/high_max": 0.0,
1145
+ "clip_ratio/high_mean": 0.0,
1146
+ "clip_ratio/low_mean": 0.0,
1147
+ "clip_ratio/low_min": 0.0,
1148
+ "clip_ratio/region_mean": 0.0,
1149
+ "completions/clipped_ratio": 0.5,
1150
+ "completions/max_length": 3584.0,
1151
+ "completions/max_terminated_length": 3383.0,
1152
+ "completions/mean_length": 2522.6875,
1153
+ "completions/mean_terminated_length": 1461.375,
1154
+ "completions/min_length": 430.0,
1155
+ "completions/min_terminated_length": 430.0,
1156
+ "epoch": 0.045714285714285714,
1157
+ "frac_reward_zero_std": 0.0,
1158
+ "grad_norm": 0.7990902662277222,
1159
+ "kl": 0.0011386871337890625,
1160
+ "learning_rate": 2.2629708984760706e-07,
1161
+ "loss": -0.3002,
1162
+ "num_tokens": 5693534.0,
1163
+ "policy_entropy_avg": 8.125,
1164
+ "reward": 0.23835521936416626,
1165
+ "reward_std": 0.3610273599624634,
1166
+ "rewards/cosine_scaled_reward/mean": -0.07706651836633682,
1167
+ "rewards/cosine_scaled_reward/std": 0.39551421999931335,
1168
+ "rewards/format_reward/mean": 0.5208333134651184,
1169
+ "rewards/format_reward/std": 0.5048523545265198,
1170
+ "step": 40
1171
+ },
1172
+ {
1173
+ "clip_ratio/high_max": 0.0,
1174
+ "clip_ratio/high_mean": 0.0,
1175
+ "clip_ratio/low_mean": 0.0,
1176
+ "clip_ratio/low_min": 0.0,
1177
+ "clip_ratio/region_mean": 0.0,
1178
+ "completions/clipped_ratio": 0.5,
1179
+ "completions/max_length": 3584.0,
1180
+ "completions/max_terminated_length": 3214.0,
1181
+ "completions/mean_length": 2528.33349609375,
1182
+ "completions/mean_terminated_length": 1472.666748046875,
1183
+ "completions/min_length": 547.0,
1184
+ "completions/min_terminated_length": 547.0,
1185
+ "epoch": 0.046857142857142854,
1186
+ "frac_reward_zero_std": 0.0,
1187
+ "grad_norm": 1.0874838829040527,
1188
+ "kl": 0.001148223876953125,
1189
+ "learning_rate": 2.0528000059645995e-07,
1190
+ "loss": -0.3434,
1191
+ "num_tokens": 5823258.0,
1192
+ "policy_entropy_avg": 8.125,
1193
+ "reward": 0.3574024438858032,
1194
+ "reward_std": 0.6161948442459106,
1195
+ "rewards/cosine_scaled_reward/mean": 0.01450828742235899,
1196
+ "rewards/cosine_scaled_reward/std": 0.46221035718917847,
1197
+ "rewards/format_reward/mean": 0.5208333134651184,
1198
+ "rewards/format_reward/std": 0.5048523545265198,
1199
+ "step": 41
1200
+ },
1201
+ {
1202
+ "clip_ratio/high_max": 0.0,
1203
+ "clip_ratio/high_mean": 0.0,
1204
+ "clip_ratio/low_mean": 0.0,
1205
+ "clip_ratio/low_min": 0.0,
1206
+ "clip_ratio/region_mean": 0.0,
1207
+ "completions/clipped_ratio": 0.6458333333333333,
1208
+ "completions/max_length": 3584.0,
1209
+ "completions/max_terminated_length": 2705.0,
1210
+ "completions/mean_length": 2656.416748046875,
1211
+ "completions/mean_terminated_length": 964.941162109375,
1212
+ "completions/min_length": 209.0,
1213
+ "completions/min_terminated_length": 209.0,
1214
+ "epoch": 0.048,
1215
+ "frac_reward_zero_std": 0.0,
1216
+ "grad_norm": 1.283677339553833,
1217
+ "kl": 0.0011049906412760417,
1218
+ "learning_rate": 1.8594235253127372e-07,
1219
+ "loss": 0.27,
1220
+ "num_tokens": 5958536.0,
1221
+ "policy_entropy_avg": 8.125,
1222
+ "reward": -0.12885941565036774,
1223
+ "reward_std": 0.28405821323394775,
1224
+ "rewards/cosine_scaled_reward/mean": -0.2866226136684418,
1225
+ "rewards/cosine_scaled_reward/std": 0.1905842125415802,
1226
+ "rewards/format_reward/mean": 0.375,
1227
+ "rewards/format_reward/std": 0.48924607038497925,
1228
+ "step": 42
1229
+ },
1230
+ {
1231
+ "clip_ratio/high_max": 0.0,
1232
+ "clip_ratio/high_mean": 0.0,
1233
+ "clip_ratio/low_mean": 0.0,
1234
+ "clip_ratio/low_min": 0.0,
1235
+ "clip_ratio/region_mean": 0.0,
1236
+ "completions/clipped_ratio": 0.5833333333333333,
1237
+ "completions/max_length": 3584.0,
1238
+ "completions/max_terminated_length": 2993.0,
1239
+ "completions/mean_length": 2602.354248046875,
1240
+ "completions/mean_terminated_length": 1228.050048828125,
1241
+ "completions/min_length": 586.0,
1242
+ "completions/min_terminated_length": 586.0,
1243
+ "epoch": 0.04914285714285714,
1244
+ "frac_reward_zero_std": 0.0,
1245
+ "grad_norm": 0.7168472409248352,
1246
+ "kl": 0.0010967254638671875,
1247
+ "learning_rate": 1.6837835672960831e-07,
1248
+ "loss": -0.0951,
1249
+ "num_tokens": 6092161.0,
1250
+ "policy_entropy_avg": 8.114583333333334,
1251
+ "reward": 0.21493911743164062,
1252
+ "reward_std": 0.4637143015861511,
1253
+ "rewards/cosine_scaled_reward/mean": -0.07424557209014893,
1254
+ "rewards/cosine_scaled_reward/std": 0.43746232986450195,
1255
+ "rewards/format_reward/mean": 0.4791666567325592,
1256
+ "rewards/format_reward/std": 0.5048523545265198,
1257
+ "step": 43
1258
+ },
1259
+ {
1260
+ "clip_ratio/high_max": 0.0,
1261
+ "clip_ratio/high_mean": 0.0,
1262
+ "clip_ratio/low_mean": 0.0,
1263
+ "clip_ratio/low_min": 0.0,
1264
+ "clip_ratio/region_mean": 0.0,
1265
+ "completions/clipped_ratio": 0.5833333333333333,
1266
+ "completions/max_length": 3584.0,
1267
+ "completions/max_terminated_length": 3087.0,
1268
+ "completions/mean_length": 2491.875,
1269
+ "completions/mean_terminated_length": 962.9000244140625,
1270
+ "completions/min_length": 340.0,
1271
+ "completions/min_terminated_length": 340.0,
1272
+ "epoch": 0.05028571428571429,
1273
+ "frac_reward_zero_std": 0.0,
1274
+ "grad_norm": 0.7064040303230286,
1275
+ "kl": 0.0012423197428385417,
1276
+ "learning_rate": 1.5267358321348285e-07,
1277
+ "loss": 0.4683,
1278
+ "num_tokens": 6219793.0,
1279
+ "policy_entropy_avg": 8.125,
1280
+ "reward": 0.2282370775938034,
1281
+ "reward_std": 0.4510888457298279,
1282
+ "rewards/cosine_scaled_reward/mean": -0.04318302869796753,
1283
+ "rewards/cosine_scaled_reward/std": 0.4827073812484741,
1284
+ "rewards/format_reward/mean": 0.4375,
1285
+ "rewards/format_reward/std": 0.5013279914855957,
1286
+ "step": 44
1287
+ },
1288
+ {
1289
+ "clip_ratio/high_max": 0.0,
1290
+ "clip_ratio/high_mean": 0.0,
1291
+ "clip_ratio/low_mean": 0.0,
1292
+ "clip_ratio/low_min": 0.0,
1293
+ "clip_ratio/region_mean": 0.0,
1294
+ "completions/clipped_ratio": 0.8125,
1295
+ "completions/max_length": 3584.0,
1296
+ "completions/max_terminated_length": 3524.0,
1297
+ "completions/mean_length": 3407.64599609375,
1298
+ "completions/mean_terminated_length": 2643.444580078125,
1299
+ "completions/min_length": 1086.0,
1300
+ "completions/min_terminated_length": 1086.0,
1301
+ "epoch": 0.05142857142857143,
1302
+ "frac_reward_zero_std": 0.0,
1303
+ "grad_norm": 0.6878222227096558,
1304
+ "kl": 0.0010045369466145833,
1305
+ "learning_rate": 1.3890454406082956e-07,
1306
+ "loss": 0.5395,
1307
+ "num_tokens": 6392102.0,
1308
+ "policy_entropy_avg": 8.135416666666666,
1309
+ "reward": 0.18636834621429443,
1310
+ "reward_std": 0.656842827796936,
1311
+ "rewards/cosine_scaled_reward/mean": -0.012889747507870197,
1312
+ "rewards/cosine_scaled_reward/std": 0.47439250349998474,
1313
+ "rewards/format_reward/mean": 0.3125,
1314
+ "rewards/format_reward/std": 0.4684174358844757,
1315
+ "step": 45
1316
+ },
1317
+ {
1318
+ "clip_ratio/high_max": 0.0,
1319
+ "clip_ratio/high_mean": 0.0,
1320
+ "clip_ratio/low_mean": 0.0,
1321
+ "clip_ratio/low_min": 0.0,
1322
+ "clip_ratio/region_mean": 0.0,
1323
+ "completions/clipped_ratio": 0.6666666666666667,
1324
+ "completions/max_length": 3584.0,
1325
+ "completions/max_terminated_length": 1561.0,
1326
+ "completions/mean_length": 2694.75,
1327
+ "completions/mean_terminated_length": 916.25,
1328
+ "completions/min_length": 596.0,
1329
+ "completions/min_terminated_length": 596.0,
1330
+ "epoch": 0.052571428571428575,
1331
+ "frac_reward_zero_std": 0.0,
1332
+ "grad_norm": 1.1607235670089722,
1333
+ "kl": 0.0015538533528645833,
1334
+ "learning_rate": 1.2713832064634125e-07,
1335
+ "loss": -0.0955,
1336
+ "num_tokens": 6529826.0,
1337
+ "policy_entropy_avg": 8.125,
1338
+ "reward": -0.13178138434886932,
1339
+ "reward_std": 0.2914193272590637,
1340
+ "rewards/cosine_scaled_reward/mean": -0.26803696155548096,
1341
+ "rewards/cosine_scaled_reward/std": 0.174865260720253,
1342
+ "rewards/format_reward/mean": 0.3333333432674408,
1343
+ "rewards/format_reward/std": 0.47639307379722595,
1344
+ "step": 46
1345
+ },
1346
+ {
1347
+ "clip_ratio/high_max": 0.0,
1348
+ "clip_ratio/high_mean": 0.0,
1349
+ "clip_ratio/low_mean": 0.0,
1350
+ "clip_ratio/low_min": 0.0,
1351
+ "clip_ratio/region_mean": 0.0,
1352
+ "completions/clipped_ratio": 0.45833333333333337,
1353
+ "completions/max_length": 3584.0,
1354
+ "completions/max_terminated_length": 3485.0,
1355
+ "completions/mean_length": 2533.6875,
1356
+ "completions/mean_terminated_length": 1644.9615478515625,
1357
+ "completions/min_length": 296.0,
1358
+ "completions/min_terminated_length": 296.0,
1359
+ "epoch": 0.053714285714285714,
1360
+ "frac_reward_zero_std": 0.0,
1361
+ "grad_norm": 2.012225389480591,
1362
+ "kl": 0.000888824462890625,
1363
+ "learning_rate": 1.1743223682775649e-07,
1364
+ "loss": -1.3532,
1365
+ "num_tokens": 6659243.0,
1366
+ "policy_entropy_avg": 8.125,
1367
+ "reward": 0.5446165800094604,
1368
+ "reward_std": 0.5056490898132324,
1369
+ "rewards/cosine_scaled_reward/mean": 0.11685246229171753,
1370
+ "rewards/cosine_scaled_reward/std": 0.5205204486846924,
1371
+ "rewards/format_reward/mean": 0.6041666865348816,
1372
+ "rewards/format_reward/std": 0.49420401453971863,
1373
+ "step": 47
1374
+ },
1375
+ {
1376
+ "clip_ratio/high_max": 0.0,
1377
+ "clip_ratio/high_mean": 0.0,
1378
+ "clip_ratio/low_mean": 0.0,
1379
+ "clip_ratio/low_min": 0.0,
1380
+ "clip_ratio/region_mean": 0.0,
1381
+ "completions/clipped_ratio": 0.5208333333333333,
1382
+ "completions/max_length": 3584.0,
1383
+ "completions/max_terminated_length": 3350.0,
1384
+ "completions/mean_length": 2581.33349609375,
1385
+ "completions/mean_terminated_length": 1491.478271484375,
1386
+ "completions/min_length": 552.0,
1387
+ "completions/min_terminated_length": 552.0,
1388
+ "epoch": 0.054857142857142854,
1389
+ "frac_reward_zero_std": 0.0,
1390
+ "grad_norm": 0.6057020425796509,
1391
+ "kl": 0.00128936767578125,
1392
+ "learning_rate": 1.0983357966978745e-07,
1393
+ "loss": 0.4498,
1394
+ "num_tokens": 6791055.0,
1395
+ "policy_entropy_avg": 8.125,
1396
+ "reward": 0.21042108535766602,
1397
+ "reward_std": 0.5790350437164307,
1398
+ "rewards/cosine_scaled_reward/mean": -0.10897094756364822,
1399
+ "rewards/cosine_scaled_reward/std": 0.40693506598472595,
1400
+ "rewards/format_reward/mean": 0.5416666865348816,
1401
+ "rewards/format_reward/std": 0.5035336017608643,
1402
+ "step": 48
1403
+ },
1404
+ {
1405
+ "clip_ratio/high_max": 0.0,
1406
+ "clip_ratio/high_mean": 0.0,
1407
+ "clip_ratio/low_mean": 0.0,
1408
+ "clip_ratio/low_min": 0.0,
1409
+ "clip_ratio/region_mean": 0.0,
1410
+ "completions/clipped_ratio": 0.39583333333333337,
1411
+ "completions/max_length": 3584.0,
1412
+ "completions/max_terminated_length": 3352.0,
1413
+ "completions/mean_length": 2207.791748046875,
1414
+ "completions/mean_terminated_length": 1306.137939453125,
1415
+ "completions/min_length": 281.0,
1416
+ "completions/min_terminated_length": 281.0,
1417
+ "epoch": 0.056,
1418
+ "frac_reward_zero_std": 0.0,
1419
+ "grad_norm": 0.9485021233558655,
1420
+ "kl": 0.0012709299723307292,
1421
+ "learning_rate": 1.0437936906629334e-07,
1422
+ "loss": -0.5865,
1423
+ "num_tokens": 6904577.0,
1424
+ "policy_entropy_avg": 8.125,
1425
+ "reward": 0.2760649025440216,
1426
+ "reward_std": 0.5270255208015442,
1427
+ "rewards/cosine_scaled_reward/mean": -0.10014239698648453,
1428
+ "rewards/cosine_scaled_reward/std": 0.42991697788238525,
1429
+ "rewards/format_reward/mean": 0.625,
1430
+ "rewards/format_reward/std": 0.48924607038497925,
1431
+ "step": 49
1432
+ },
1433
+ {
1434
+ "clip_ratio/high_max": 0.0,
1435
+ "clip_ratio/high_mean": 0.0,
1436
+ "clip_ratio/low_mean": 0.0,
1437
+ "clip_ratio/low_min": 0.0,
1438
+ "clip_ratio/region_mean": 0.0,
1439
+ "completions/clipped_ratio": 0.6875,
1440
+ "completions/max_length": 3584.0,
1441
+ "completions/max_terminated_length": 2808.0,
1442
+ "completions/mean_length": 2786.791748046875,
1443
+ "completions/mean_terminated_length": 1032.933349609375,
1444
+ "completions/min_length": 358.0,
1445
+ "completions/min_terminated_length": 358.0,
1446
+ "epoch": 0.05714285714285714,
1447
+ "frac_reward_zero_std": 0.0,
1448
+ "grad_norm": 0.7447382807731628,
1449
+ "kl": 0.001110076904296875,
1450
+ "learning_rate": 1.0109617738307911e-07,
1451
+ "loss": -0.2738,
1452
+ "num_tokens": 7046455.0,
1453
+ "policy_entropy_avg": 8.135416666666666,
1454
+ "reward": 0.2738919258117676,
1455
+ "reward_std": 0.44107958674430847,
1456
+ "rewards/cosine_scaled_reward/mean": 0.023186095058918,
1457
+ "rewards/cosine_scaled_reward/std": 0.3980216383934021,
1458
+ "rewards/format_reward/mean": 0.375,
1459
+ "rewards/format_reward/std": 0.48924607038497925,
1460
+ "step": 50
1461
+ },
1462
+ {
1463
+ "epoch": 0.05714285714285714,
1464
+ "step": 50,
1465
+ "total_flos": 0.0,
1466
+ "train_loss": -0.45832799572497607,
1467
+ "train_runtime": 4441.2019,
1468
+ "train_samples_per_second": 0.54,
1469
+ "train_steps_per_second": 0.011
1470
+ }
1471
+ ],
1472
+ "logging_steps": 1,
1473
+ "max_steps": 50,
1474
+ "num_input_tokens_seen": 7046455,
1475
+ "num_train_epochs": 1,
1476
+ "save_steps": 50,
1477
+ "stateful_callbacks": {
1478
+ "TrainerControl": {
1479
+ "args": {
1480
+ "should_epoch_stop": false,
1481
+ "should_evaluate": false,
1482
+ "should_log": false,
1483
+ "should_save": true,
1484
+ "should_training_stop": true
1485
+ },
1486
+ "attributes": {}
1487
+ }
1488
+ },
1489
+ "total_flos": 0.0,
1490
+ "train_batch_size": 4,
1491
+ "trial_name": null,
1492
+ "trial_params": null
1493
+ }