Maker-0409 committed on
Commit
b1930f0
·
verified ·
1 Parent(s): 75605cd

Model save

Browse files
README.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-Math-7B
3
+ library_name: transformers
4
+ model_name: Qwen-2.5-7B-Simple-RL
5
+ tags:
6
+ - generated_from_trainer
7
+ - trl
8
+ - grpo
9
+ licence: license
10
+ ---
11
+
12
+ # Model Card for Qwen-2.5-7B-Simple-RL
13
+
14
+ This model is a fine-tuned version of [Qwen/Qwen2.5-Math-7B](https://huggingface.co/Qwen/Qwen2.5-Math-7B).
15
+ It has been trained using [TRL](https://github.com/huggingface/trl).
16
+
17
+ ## Quick start
18
+
19
+ ```python
20
+ from transformers import pipeline
21
+
22
+ question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?"
23
+ generator = pipeline("text-generation", model="Maker-0409/Qwen-2.5-7B-Simple-RL", device="cuda")
24
+ output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0]
25
+ print(output["generated_text"])
26
+ ```
27
+
28
+ ## Training procedure
29
+
30
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/taochenai/huggingface/runs/hihrk4wo)
31
+
32
+
33
+ This model was trained with GRPO, a method introduced in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300).
34
+
35
+ ### Framework versions
36
+
37
+ - TRL: 0.15.0.dev0
38
+ - Transformers: 4.49.0.dev0
39
+ - Pytorch: 2.5.1+cu121
40
+ - Datasets: 3.2.0
41
+ - Tokenizers: 0.21.0
42
+
43
+ ## Citations
44
+
45
+ Cite GRPO as:
46
+
47
+ ```bibtex
48
+ @article{zhihong2024deepseekmath,
49
+ title = {{DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models}},
50
+ author = {Zhihong Shao and Peiyi Wang and Qihao Zhu and Runxin Xu and Junxiao Song and Mingchuan Zhang and Y. K. Li and Y. Wu and Daya Guo},
51
+ year = 2024,
52
+ eprint = {arXiv:2402.03300},
53
+ }
54
+
55
+ ```
56
+
57
+ Cite TRL as:
58
+
59
+ ```bibtex
60
+ @misc{vonwerra2022trl,
61
+ title = {{TRL: Transformer Reinforcement Learning}},
62
+ author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang and Kashif Rasul and Quentin Gallouédec},
63
+ year = 2020,
64
+ journal = {GitHub repository},
65
+ publisher = {GitHub},
66
+ howpublished = {\url{https://github.com/huggingface/trl}}
67
+ }
68
+ ```
all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 4841.422249500714,
4
+ "train_runtime": 180396.3107,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.042,
7
+ "train_steps_per_second": 0.003
8
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "max_new_tokens": 2048,
5
+ "transformers_version": "4.49.0.dev0"
6
+ }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:988336cece8f0bddbadb4ddc1af32578d73546bbcbee61322797acc652909568
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c29d9eecbf30eef037859879dbc6af4acc10a3ff8fb79cb4732cbca41c35fbe3
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3bf9fc7e0cdb7f18431de4c1c4bd7242a4aa1489da083210e3b5604aecc5bfb
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab9b09035bebd0c729f8bbd320c3152f62583df612cf7faaac3eac466ae6557c
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:697b20a7db1670a093afc92cf8d3a3f8000f17dd201d3d115115282b86963d4f
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2c78dec2dd3327d10869c4da125411e4e739412f00b94cdc15580e698d9ac77
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a489988de3cb1566b1845d3ab7bfcdd065744b45f0a4e459444bfa2ca361fc49
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9d39c18161aa93e39851678ad03057059b91c440ffec216018c112758dcec9a8
3
  size 1089994880
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "total_flos": 0.0,
3
+ "train_loss": 4841.422249500714,
4
+ "train_runtime": 180396.3107,
5
+ "train_samples": 7500,
6
+ "train_samples_per_second": 0.042,
7
+ "train_steps_per_second": 0.003
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,1509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 0.9984,
5
+ "eval_steps": 100,
6
+ "global_step": 468,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "completion_length": 633.2446681976319,
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 2.2443933486938477,
15
+ "kl": 0.00011417865753173828,
16
+ "learning_rate": 3.1914893617021275e-07,
17
+ "loss": 0.0,
18
+ "reward": 1.138736367225647,
19
+ "reward_std": 0.8278621450066567,
20
+ "rewards/accuracy_reward": 0.5946428831666708,
21
+ "rewards/cosine_scaled_reward": 0.2899268216686323,
22
+ "rewards/format_reward": 0.0,
23
+ "rewards/reasoning_steps_reward": 0.25416668243706225,
24
+ "step": 5
25
+ },
26
+ {
27
+ "completion_length": 600.8857383728027,
28
+ "epoch": 0.021333333333333333,
29
+ "grad_norm": 5.001251220703125,
30
+ "kl": 0.00020779371261596679,
31
+ "learning_rate": 6.382978723404255e-07,
32
+ "loss": 0.0,
33
+ "reward": 1.2528822764754295,
34
+ "reward_std": 0.8592379853129387,
35
+ "rewards/accuracy_reward": 0.6553571775555611,
36
+ "rewards/cosine_scaled_reward": 0.34097747248015364,
37
+ "rewards/format_reward": 0.0,
38
+ "rewards/reasoning_steps_reward": 0.25654763616621495,
39
+ "step": 10
40
+ },
41
+ {
42
+ "completion_length": 601.8518112182617,
43
+ "epoch": 0.032,
44
+ "grad_norm": 3.453845500946045,
45
+ "kl": 0.00034580230712890627,
46
+ "learning_rate": 9.574468085106384e-07,
47
+ "loss": 0.0,
48
+ "reward": 1.2825960636138916,
49
+ "reward_std": 0.7762525148689747,
50
+ "rewards/accuracy_reward": 0.6642857484519482,
51
+ "rewards/cosine_scaled_reward": 0.3486674582702108,
52
+ "rewards/format_reward": 0.0,
53
+ "rewards/reasoning_steps_reward": 0.26964287189766767,
54
+ "step": 15
55
+ },
56
+ {
57
+ "completion_length": 620.7839553833007,
58
+ "epoch": 0.042666666666666665,
59
+ "grad_norm": 63.01131057739258,
60
+ "kl": 0.001246500015258789,
61
+ "learning_rate": 1.276595744680851e-06,
62
+ "loss": 0.0001,
63
+ "reward": 1.2914750523865224,
64
+ "reward_std": 0.7945833645761013,
65
+ "rewards/accuracy_reward": 0.6571428865194321,
66
+ "rewards/cosine_scaled_reward": 0.3593321413063677,
67
+ "rewards/format_reward": 0.0,
68
+ "rewards/reasoning_steps_reward": 0.2750000203028321,
69
+ "step": 20
70
+ },
71
+ {
72
+ "completion_length": 639.3946762084961,
73
+ "epoch": 0.05333333333333334,
74
+ "grad_norm": 1.1951252222061157,
75
+ "kl": 0.001938199996948242,
76
+ "learning_rate": 1.5957446808510639e-06,
77
+ "loss": 0.0001,
78
+ "reward": 1.2197763450443744,
79
+ "reward_std": 0.7964548453688621,
80
+ "rewards/accuracy_reward": 0.6285714630037547,
81
+ "rewards/cosine_scaled_reward": 0.323942980915308,
82
+ "rewards/format_reward": 0.0,
83
+ "rewards/reasoning_steps_reward": 0.2672619212418795,
84
+ "step": 25
85
+ },
86
+ {
87
+ "completion_length": 645.9482414245606,
88
+ "epoch": 0.064,
89
+ "grad_norm": 0.5322187542915344,
90
+ "kl": 0.0028698921203613283,
91
+ "learning_rate": 1.9148936170212767e-06,
92
+ "loss": 0.0001,
93
+ "reward": 1.34233574308455,
94
+ "reward_std": 0.7051636058837175,
95
+ "rewards/accuracy_reward": 0.6821428902447224,
96
+ "rewards/cosine_scaled_reward": 0.38400235488079487,
97
+ "rewards/format_reward": 0.0,
98
+ "rewards/reasoning_steps_reward": 0.2761904950253665,
99
+ "step": 30
100
+ },
101
+ {
102
+ "completion_length": 630.1071678161621,
103
+ "epoch": 0.07466666666666667,
104
+ "grad_norm": 0.686019241809845,
105
+ "kl": 0.00424489974975586,
106
+ "learning_rate": 2.2340425531914894e-06,
107
+ "loss": 0.0002,
108
+ "reward": 1.2706220560474322,
109
+ "reward_std": 0.7081292014569044,
110
+ "rewards/accuracy_reward": 0.6839286010712385,
111
+ "rewards/cosine_scaled_reward": 0.34145535016432405,
112
+ "rewards/format_reward": 0.0,
113
+ "rewards/reasoning_steps_reward": 0.2452381114475429,
114
+ "step": 35
115
+ },
116
+ {
117
+ "completion_length": 663.8464553833007,
118
+ "epoch": 0.08533333333333333,
119
+ "grad_norm": 10619385856.0,
120
+ "kl": 11324620.806011772,
121
+ "learning_rate": 2.553191489361702e-06,
122
+ "loss": 453134.65,
123
+ "reward": 1.4818414891138674,
124
+ "reward_std": 0.724718413501978,
125
+ "rewards/accuracy_reward": 0.7196428954601288,
126
+ "rewards/cosine_scaled_reward": 0.43124618427827954,
127
+ "rewards/format_reward": 0.0,
128
+ "rewards/reasoning_steps_reward": 0.3309524044394493,
129
+ "step": 40
130
+ },
131
+ {
132
+ "completion_length": 636.5178840637207,
133
+ "epoch": 0.096,
134
+ "grad_norm": 0.4083445370197296,
135
+ "kl": 0.1388763427734375,
136
+ "learning_rate": 2.872340425531915e-06,
137
+ "loss": 0.0055,
138
+ "reward": 1.5206772923469543,
139
+ "reward_std": 0.6890950493514538,
140
+ "rewards/accuracy_reward": 0.7428571715950966,
141
+ "rewards/cosine_scaled_reward": 0.4444867596961558,
142
+ "rewards/format_reward": 0.0,
143
+ "rewards/reasoning_steps_reward": 0.3333333550952375,
144
+ "step": 45
145
+ },
146
+ {
147
+ "completion_length": 624.0178833007812,
148
+ "epoch": 0.10666666666666667,
149
+ "grad_norm": 0.6491600275039673,
150
+ "kl": 0.014713478088378907,
151
+ "learning_rate": 2.9996241442585123e-06,
152
+ "loss": 0.0006,
153
+ "reward": 1.5073627218604089,
154
+ "reward_std": 0.7132997542619706,
155
+ "rewards/accuracy_reward": 0.712500025331974,
156
+ "rewards/cosine_scaled_reward": 0.41093407664448023,
157
+ "rewards/format_reward": 0.0,
158
+ "rewards/reasoning_steps_reward": 0.38392860516905786,
159
+ "step": 50
160
+ },
161
+ {
162
+ "completion_length": 631.5339569091797,
163
+ "epoch": 0.11733333333333333,
164
+ "grad_norm": 0.7147920727729797,
165
+ "kl": 0.007195663452148437,
166
+ "learning_rate": 2.9973279301399446e-06,
167
+ "loss": 0.0003,
168
+ "reward": 1.5377919152379036,
169
+ "reward_std": 0.76092077344656,
170
+ "rewards/accuracy_reward": 0.7232143200933934,
171
+ "rewards/cosine_scaled_reward": 0.4282680474221706,
172
+ "rewards/format_reward": 0.0,
173
+ "rewards/reasoning_steps_reward": 0.386309552192688,
174
+ "step": 55
175
+ },
176
+ {
177
+ "completion_length": 627.9214561462402,
178
+ "epoch": 0.128,
179
+ "grad_norm": 0.8942143321037292,
180
+ "kl": 0.008642578125,
181
+ "learning_rate": 2.992947502998804e-06,
182
+ "loss": 0.0003,
183
+ "reward": 1.6543699458241463,
184
+ "reward_std": 0.7264986954629421,
185
+ "rewards/accuracy_reward": 0.7214285999536514,
186
+ "rewards/cosine_scaled_reward": 0.40972703909501434,
187
+ "rewards/format_reward": 0.0,
188
+ "rewards/reasoning_steps_reward": 0.5232143249362707,
189
+ "step": 60
190
+ },
191
+ {
192
+ "completion_length": 633.0232421875,
193
+ "epoch": 0.13866666666666666,
194
+ "grad_norm": 6.921348571777344,
195
+ "kl": 0.01439208984375,
196
+ "learning_rate": 2.9864889601923268e-06,
197
+ "loss": 0.0006,
198
+ "reward": 1.7206872910261155,
199
+ "reward_std": 0.7344334974884987,
200
+ "rewards/accuracy_reward": 0.725000036507845,
201
+ "rewards/cosine_scaled_reward": 0.43497296012938025,
202
+ "rewards/format_reward": 0.0,
203
+ "rewards/reasoning_steps_reward": 0.5607143200933933,
204
+ "step": 65
205
+ },
206
+ {
207
+ "completion_length": 656.7178894042969,
208
+ "epoch": 0.14933333333333335,
209
+ "grad_norm": 0.6442045569419861,
210
+ "kl": 0.01673736572265625,
211
+ "learning_rate": 2.977961291721137e-06,
212
+ "loss": 0.0007,
213
+ "reward": 1.8801582887768746,
214
+ "reward_std": 0.7263622097671032,
215
+ "rewards/accuracy_reward": 0.7571428894996644,
216
+ "rewards/cosine_scaled_reward": 0.47301534870639445,
217
+ "rewards/format_reward": 0.0,
218
+ "rewards/reasoning_steps_reward": 0.6500000521540642,
219
+ "step": 70
220
+ },
221
+ {
222
+ "completion_length": 619.4536033630371,
223
+ "epoch": 0.16,
224
+ "grad_norm": 1.7239394187927246,
225
+ "kl": 0.026496124267578126,
226
+ "learning_rate": 2.9673763677155655e-06,
227
+ "loss": 0.0011,
228
+ "reward": 1.8051109313964844,
229
+ "reward_std": 0.7346500240266323,
230
+ "rewards/accuracy_reward": 0.7160714596509934,
231
+ "rewards/cosine_scaled_reward": 0.39439656864851713,
232
+ "rewards/format_reward": 0.0,
233
+ "rewards/reasoning_steps_reward": 0.6946429140865803,
234
+ "step": 75
235
+ },
236
+ {
237
+ "completion_length": 623.1785926818848,
238
+ "epoch": 0.17066666666666666,
239
+ "grad_norm": 0.6716666221618652,
240
+ "kl": 0.018997955322265624,
241
+ "learning_rate": 2.9547489219129666e-06,
242
+ "loss": 0.0008,
243
+ "reward": 1.9212585434317588,
244
+ "reward_std": 0.634969700500369,
245
+ "rewards/accuracy_reward": 0.7785714574158191,
246
+ "rewards/cosine_scaled_reward": 0.4653060721466318,
247
+ "rewards/format_reward": 0.0,
248
+ "rewards/reasoning_steps_reward": 0.6773809991776943,
249
+ "step": 80
250
+ },
251
+ {
252
+ "completion_length": 690.1518196105957,
253
+ "epoch": 0.18133333333333335,
254
+ "grad_norm": 1.1456305980682373,
255
+ "kl": 0.02204437255859375,
256
+ "learning_rate": 2.9400965311490175e-06,
257
+ "loss": 0.0009,
258
+ "reward": 1.9084690719842912,
259
+ "reward_std": 0.7263222638517618,
260
+ "rewards/accuracy_reward": 0.7303571783006191,
261
+ "rewards/cosine_scaled_reward": 0.4507309086387977,
262
+ "rewards/format_reward": 0.0,
263
+ "rewards/reasoning_steps_reward": 0.7273810178041458,
264
+ "step": 85
265
+ },
266
+ {
267
+ "completion_length": 650.4768188476562,
268
+ "epoch": 0.192,
269
+ "grad_norm": 29.814361572265625,
270
+ "kl": 0.078216552734375,
271
+ "learning_rate": 2.9234395908915565e-06,
272
+ "loss": 0.0031,
273
+ "reward": 1.8972563683986663,
274
+ "reward_std": 0.7165740359574556,
275
+ "rewards/accuracy_reward": 0.6875000324100256,
276
+ "rewards/cosine_scaled_reward": 0.4055896209087223,
277
+ "rewards/format_reward": 0.0,
278
+ "rewards/reasoning_steps_reward": 0.8041667267680168,
279
+ "step": 90
280
+ },
281
+ {
282
+ "completion_length": 668.3339584350585,
283
+ "epoch": 0.20266666666666666,
284
+ "grad_norm": 0.48750847578048706,
285
+ "kl": 0.02767181396484375,
286
+ "learning_rate": 2.904801286851009e-06,
287
+ "loss": 0.0011,
288
+ "reward": 1.9524270623922348,
289
+ "reward_std": 0.6363851364701987,
290
+ "rewards/accuracy_reward": 0.7035714564844966,
291
+ "rewards/cosine_scaled_reward": 0.42206980669870975,
292
+ "rewards/format_reward": 0.0,
293
+ "rewards/reasoning_steps_reward": 0.826785783469677,
294
+ "step": 95
295
+ },
296
+ {
297
+ "completion_length": 645.9428840637207,
298
+ "epoch": 0.21333333333333335,
299
+ "grad_norm": 0.8315287232398987,
300
+ "kl": 0.02986602783203125,
301
+ "learning_rate": 2.884207562706925e-06,
302
+ "loss": 0.0012,
303
+ "reward": 2.0384097367525102,
304
+ "reward_std": 0.6786769151687622,
305
+ "rewards/accuracy_reward": 0.7517857387661934,
306
+ "rewards/cosine_scaled_reward": 0.4657905898289755,
307
+ "rewards/format_reward": 0.0,
308
+ "rewards/reasoning_steps_reward": 0.820833396166563,
309
+ "step": 100
310
+ },
311
+ {
312
+ "epoch": 0.21333333333333335,
313
+ "eval_completion_length": 688.0076597412109,
314
+ "eval_kl": 0.0332870361328125,
315
+ "eval_loss": 0.0013802805915474892,
316
+ "eval_reward": 1.86520801551342,
317
+ "eval_reward_std": 0.7114028903335333,
318
+ "eval_rewards/accuracy_reward": 0.650542886838317,
319
+ "eval_rewards/cosine_scaled_reward": 0.3737031816519331,
320
+ "eval_rewards/format_reward": 0.0,
321
+ "eval_rewards/reasoning_steps_reward": 0.8409619681358338,
322
+ "eval_runtime": 32350.4437,
323
+ "eval_samples_per_second": 0.155,
324
+ "eval_steps_per_second": 0.011,
325
+ "step": 100
326
+ },
327
+ {
328
+ "completion_length": 717.150033569336,
329
+ "epoch": 0.224,
330
+ "grad_norm": 1.5486549139022827,
331
+ "kl": 0.03196563720703125,
332
+ "learning_rate": 2.8616870839955444e-06,
333
+ "loss": 0.0013,
334
+ "reward": 2.0346583992242815,
335
+ "reward_std": 0.7014419212937355,
336
+ "rewards/accuracy_reward": 0.7232143215835094,
337
+ "rewards/cosine_scaled_reward": 0.457277343980968,
338
+ "rewards/format_reward": 0.0,
339
+ "rewards/reasoning_steps_reward": 0.8541667237877846,
340
+ "step": 105
341
+ },
342
+ {
343
+ "completion_length": 708.8571708679199,
344
+ "epoch": 0.23466666666666666,
345
+ "grad_norm": 0.5981384515762329,
346
+ "kl": 0.02979583740234375,
347
+ "learning_rate": 2.837271198208662e-06,
348
+ "loss": 0.0012,
349
+ "reward": 2.0179374665021896,
350
+ "reward_std": 0.6652137346565723,
351
+ "rewards/accuracy_reward": 0.7250000320374965,
352
+ "rewards/cosine_scaled_reward": 0.47091358043253423,
353
+ "rewards/format_reward": 0.0,
354
+ "rewards/reasoning_steps_reward": 0.8220238700509072,
355
+ "step": 110
356
+ },
357
+ {
358
+ "completion_length": 632.7732406616211,
359
+ "epoch": 0.24533333333333332,
360
+ "grad_norm": 0.7111315131187439,
361
+ "kl": 0.02539825439453125,
362
+ "learning_rate": 2.8109938911593322e-06,
363
+ "loss": 0.001,
364
+ "reward": 2.0148118153214454,
365
+ "reward_std": 0.6429756574332715,
366
+ "rewards/accuracy_reward": 0.728571455553174,
367
+ "rewards/cosine_scaled_reward": 0.44754982106387614,
368
+ "rewards/format_reward": 0.0,
369
+ "rewards/reasoning_steps_reward": 0.8386905357241631,
370
+ "step": 115
371
+ },
372
+ {
373
+ "completion_length": 655.8321723937988,
374
+ "epoch": 0.256,
375
+ "grad_norm": 0.5316483974456787,
376
+ "kl": 0.02179107666015625,
377
+ "learning_rate": 2.7828917396751474e-06,
378
+ "loss": 0.0009,
379
+ "reward": 1.9900789648294448,
380
+ "reward_std": 0.6477071691304446,
381
+ "rewards/accuracy_reward": 0.7160714656114578,
382
+ "rewards/cosine_scaled_reward": 0.43412648113444446,
383
+ "rewards/format_reward": 0.0,
384
+ "rewards/reasoning_steps_reward": 0.8398810118436814,
385
+ "step": 120
386
+ },
387
+ {
388
+ "completion_length": 644.7321693420411,
389
+ "epoch": 0.26666666666666666,
390
+ "grad_norm": 0.4458823800086975,
391
+ "kl": 0.025299072265625,
392
+ "learning_rate": 2.753003860684943e-06,
393
+ "loss": 0.001,
394
+ "reward": 2.1427780210971834,
395
+ "reward_std": 0.6711063630878925,
396
+ "rewards/accuracy_reward": 0.7750000268220901,
397
+ "rewards/cosine_scaled_reward": 0.5183731818571686,
398
+ "rewards/format_reward": 0.0,
399
+ "rewards/reasoning_steps_reward": 0.8494048312306404,
400
+ "step": 125
401
+ },
402
+ {
403
+ "completion_length": 684.2911033630371,
404
+ "epoch": 0.2773333333333333,
405
+ "grad_norm": 0.7146270871162415,
406
+ "kl": 0.034222412109375,
407
+ "learning_rate": 2.721371856769793e-06,
408
+ "loss": 0.0014,
409
+ "reward": 1.9814838409423827,
410
+ "reward_std": 0.7353869907557964,
411
+ "rewards/accuracy_reward": 0.6625000331550837,
412
+ "rewards/cosine_scaled_reward": 0.3981504186260281,
413
+ "rewards/format_reward": 0.0,
414
+ "rewards/reasoning_steps_reward": 0.9208333924412727,
415
+ "step": 130
416
+ },
417
+ {
418
+ "completion_length": 650.483960723877,
419
+ "epoch": 0.288,
420
+ "grad_norm": 0.8331003189086914,
421
+ "kl": 0.046978759765625,
422
+ "learning_rate": 2.688039758254093e-06,
423
+ "loss": 0.0019,
424
+ "reward": 2.223627084493637,
425
+ "reward_std": 0.6465678755193949,
426
+ "rewards/accuracy_reward": 0.7732143219560385,
427
+ "rewards/cosine_scaled_reward": 0.506960358901415,
428
+ "rewards/format_reward": 0.0,
429
+ "rewards/reasoning_steps_reward": 0.94345243871212,
430
+ "step": 135
431
+ },
432
+ {
433
+ "completion_length": 702.9536026000976,
434
+ "epoch": 0.2986666666666667,
435
+ "grad_norm": 1.9107334613800049,
436
+ "kl": 0.0536590576171875,
437
+ "learning_rate": 2.65305396191733e-06,
438
+ "loss": 0.0021,
439
+ "reward": 2.1239778250455856,
440
+ "reward_std": 0.6765143848955631,
441
+ "rewards/accuracy_reward": 0.7071428891271353,
442
+ "rewards/cosine_scaled_reward": 0.4555253505706787,
443
+ "rewards/format_reward": 0.0,
444
+ "rewards/reasoning_steps_reward": 0.9613095715641975,
445
+ "step": 140
446
+ },
447
+ {
448
+ "completion_length": 733.6089630126953,
449
+ "epoch": 0.30933333333333335,
450
+ "grad_norm": 0.5300867557525635,
451
+ "kl": 0.05316162109375,
452
+ "learning_rate": 2.61646316641186e-06,
453
+ "loss": 0.0021,
454
+ "reward": 2.1554796636104583,
455
+ "reward_std": 0.6578622825443745,
456
+ "rewards/accuracy_reward": 0.7303571704775095,
457
+ "rewards/cosine_scaled_reward": 0.47036054339259864,
458
+ "rewards/format_reward": 0.0,
459
+ "rewards/reasoning_steps_reward": 0.9547619551420212,
460
+ "step": 145
461
+ },
462
+ {
463
+ "completion_length": 713.221459197998,
464
+ "epoch": 0.32,
465
+ "grad_norm": 0.6026062369346619,
466
+ "kl": 0.0533843994140625,
467
+ "learning_rate": 2.5783183044765715e-06,
468
+ "loss": 0.0021,
469
+ "reward": 2.1126459658145906,
470
+ "reward_std": 0.5920085646212101,
471
+ "rewards/accuracy_reward": 0.7089285995811224,
472
+ "rewards/cosine_scaled_reward": 0.4566935421898961,
473
+ "rewards/format_reward": 0.0,
474
+ "rewards/reasoning_steps_reward": 0.9470238655805587,
475
+ "step": 150
476
+ },
477
+ {
478
+ "completion_length": 678.6428886413574,
479
+ "epoch": 0.33066666666666666,
480
+ "grad_norm": 0.6598377227783203,
481
+ "kl": 0.049908447265625,
482
+ "learning_rate": 2.5386724720408135e-06,
483
+ "loss": 0.002,
484
+ "reward": 2.243595580756664,
485
+ "reward_std": 0.6088640403002501,
486
+ "rewards/accuracy_reward": 0.7767857441678643,
487
+ "rewards/cosine_scaled_reward": 0.5435954930260778,
488
+ "rewards/format_reward": 0.0,
489
+ "rewards/reasoning_steps_reward": 0.9232143476605416,
490
+ "step": 155
491
+ },
492
+ {
493
+ "completion_length": 683.9268142700196,
494
+ "epoch": 0.3413333333333333,
495
+ "grad_norm": 0.6654959321022034,
496
+ "kl": 0.0447540283203125,
497
+ "learning_rate": 2.49758085431725e-06,
498
+ "loss": 0.0018,
499
+ "reward": 2.0952899247407912,
500
+ "reward_std": 0.6968366518616677,
501
+ "rewards/accuracy_reward": 0.7232143208384514,
502
+ "rewards/cosine_scaled_reward": 0.4637422326952219,
503
+ "rewards/format_reward": 0.0,
504
+ "rewards/reasoning_steps_reward": 0.9083333939313889,
505
+ "step": 160
506
+ },
507
+ {
508
+ "completion_length": 691.3464614868165,
509
+ "epoch": 0.352,
510
+ "grad_norm": 0.689552903175354,
511
+ "kl": 0.0448211669921875,
512
+ "learning_rate": 2.455100648986533e-06,
513
+ "loss": 0.0018,
514
+ "reward": 2.0519487097859384,
515
+ "reward_std": 0.7221721112728119,
516
+ "rewards/accuracy_reward": 0.6964286031201482,
517
+ "rewards/cosine_scaled_reward": 0.4602819522842765,
518
+ "rewards/format_reward": 0.0,
519
+ "rewards/reasoning_steps_reward": 0.8952381581068038,
520
+ "step": 165
521
+ },
522
+ {
523
+ "completion_length": 696.5268180847168,
524
+ "epoch": 0.3626666666666667,
525
+ "grad_norm": 1.0024878978729248,
526
+ "kl": 0.065167236328125,
527
+ "learning_rate": 2.4112909865807053e-06,
528
+ "loss": 0.0026,
529
+ "reward": 1.7887505039572715,
530
+ "reward_std": 0.7482936225831509,
531
+ "rewards/accuracy_reward": 0.571428600884974,
532
+ "rewards/cosine_scaled_reward": 0.3333932981360704,
533
+ "rewards/format_reward": 0.0,
534
+ "rewards/reasoning_steps_reward": 0.8839286401867866,
535
+ "step": 170
536
+ },
537
+ {
538
+ "completion_length": 703.2714614868164,
539
+ "epoch": 0.37333333333333335,
540
+ "grad_norm": 0.5711168050765991,
541
+ "kl": 0.093731689453125,
542
+ "learning_rate": 2.366212848176164e-06,
543
+ "loss": 0.0037,
544
+ "reward": 1.9069189459085465,
545
+ "reward_std": 0.8069212771952152,
546
+ "rewards/accuracy_reward": 0.6500000327825546,
547
+ "rewards/cosine_scaled_reward": 0.42358550764620306,
548
+ "rewards/format_reward": 0.0,
549
+ "rewards/reasoning_steps_reward": 0.8333333879709244,
550
+ "step": 175
551
+ },
552
+ {
553
+ "completion_length": 714.2536003112793,
554
+ "epoch": 0.384,
555
+ "grad_norm": 3.1069464683532715,
556
+ "kl": 0.1747802734375,
557
+ "learning_rate": 2.319928980510752e-06,
558
+ "loss": 0.007,
559
+ "reward": 1.6917703241109847,
560
+ "reward_std": 0.8836216881871224,
561
+ "rewards/accuracy_reward": 0.6089285977184773,
562
+ "rewards/cosine_scaled_reward": 0.35307975246978457,
563
+ "rewards/format_reward": 0.0,
564
+ "rewards/reasoning_steps_reward": 0.7297619581222534,
565
+ "step": 180
566
+ },
567
+ {
568
+ "completion_length": 727.7018188476562,
569
+ "epoch": 0.39466666666666667,
570
+ "grad_norm": 1.1932159662246704,
571
+ "kl": 0.193988037109375,
572
+ "learning_rate": 2.272503808643123e-06,
573
+ "loss": 0.0078,
574
+ "reward": 1.7027929693460464,
575
+ "reward_std": 0.7921728197485208,
576
+ "rewards/accuracy_reward": 0.6267857421189547,
577
+ "rewards/cosine_scaled_reward": 0.3605310095474124,
578
+ "rewards/format_reward": 0.0,
579
+ "rewards/reasoning_steps_reward": 0.7154762461781502,
580
+ "step": 185
581
+ },
582
+ {
583
+ "completion_length": 677.6518127441407,
584
+ "epoch": 0.4053333333333333,
585
+ "grad_norm": 0.6525413393974304,
586
+ "kl": 0.1227813720703125,
587
+ "learning_rate": 2.2240033462759628e-06,
588
+ "loss": 0.0049,
589
+ "reward": 2.055608908832073,
590
+ "reward_std": 0.6409808352589608,
591
+ "rewards/accuracy_reward": 0.7428571667522192,
592
+ "rewards/cosine_scaled_reward": 0.4907278836122714,
593
+ "rewards/format_reward": 0.0,
594
+ "rewards/reasoning_steps_reward": 0.8220238700509072,
595
+ "step": 190
596
+ },
597
+ {
598
+ "completion_length": 729.3125358581543,
599
+ "epoch": 0.416,
600
+ "grad_norm": 0.470821738243103,
601
+ "kl": 0.1053009033203125,
602
+ "learning_rate": 2.1744951038678905e-06,
603
+ "loss": 0.0042,
604
+ "reward": 2.1352262631058694,
605
+ "reward_std": 0.6541992913931608,
606
+ "rewards/accuracy_reward": 0.7446428880095481,
607
+ "rewards/cosine_scaled_reward": 0.5340357202105224,
608
+ "rewards/format_reward": 0.0,
609
+ "rewards/reasoning_steps_reward": 0.8565476804971695,
610
+ "step": 195
611
+ },
612
+ {
613
+ "completion_length": 736.6607482910156,
614
+ "epoch": 0.4266666666666667,
615
+ "grad_norm": 0.3663829267024994,
616
+ "kl": 0.145220947265625,
617
+ "learning_rate": 2.124047994661941e-06,
618
+ "loss": 0.0058,
619
+ "reward": 2.0683016672730448,
620
+ "reward_std": 0.6785697277635336,
621
+ "rewards/accuracy_reward": 0.7107143150642514,
622
+ "rewards/cosine_scaled_reward": 0.4861587251536548,
623
+ "rewards/format_reward": 0.0,
624
+ "rewards/reasoning_steps_reward": 0.8714286342263222,
625
+ "step": 200
626
+ },
627
+ {
628
+ "epoch": 0.4266666666666667,
629
+ "eval_completion_length": 743.3604330322265,
630
+ "eval_kl": 0.1699279296875,
631
+ "eval_loss": 0.006734147202223539,
632
+ "eval_reward": 1.8947704853653908,
633
+ "eval_reward_std": 0.7092250557422638,
634
+ "eval_rewards/accuracy_reward": 0.6307143133163452,
635
+ "eval_rewards/cosine_scaled_reward": 0.39257041423644407,
636
+ "eval_rewards/format_reward": 0.0,
637
+ "eval_rewards/reasoning_steps_reward": 0.871485775399208,
638
+ "eval_runtime": 32670.592,
639
+ "eval_samples_per_second": 0.153,
640
+ "eval_steps_per_second": 0.011,
641
+ "step": 200
642
+ },
643
+ {
644
+ "completion_length": 752.7053955078125,
645
+ "epoch": 0.43733333333333335,
646
+ "grad_norm": 0.5299625396728516,
647
+ "kl": 0.1930633544921875,
648
+ "learning_rate": 2.072732238761434e-06,
649
+ "loss": 0.0077,
650
+ "reward": 1.8860187515616418,
651
+ "reward_std": 0.7606242794543505,
652
+ "rewards/accuracy_reward": 0.6446428863331676,
653
+ "rewards/cosine_scaled_reward": 0.40447108587541153,
654
+ "rewards/format_reward": 0.0,
655
+ "rewards/reasoning_steps_reward": 0.8369048193097115,
656
+ "step": 205
657
+ },
658
+ {
659
+ "completion_length": 733.603606414795,
660
+ "epoch": 0.448,
661
+ "grad_norm": 1.6152819395065308,
662
+ "kl": 0.219268798828125,
663
+ "learning_rate": 2.0206192653867536e-06,
664
+ "loss": 0.0088,
665
+ "reward": 1.997245892137289,
666
+ "reward_std": 0.7402419943362475,
667
+ "rewards/accuracy_reward": 0.7017857382073999,
668
+ "rewards/cosine_scaled_reward": 0.47284105569124224,
669
+ "rewards/format_reward": 0.0,
670
+ "rewards/reasoning_steps_reward": 0.8226191058754921,
671
+ "step": 210
672
+ },
673
+ {
674
+ "completion_length": 844.0661102294922,
675
+ "epoch": 0.45866666666666667,
676
+ "grad_norm": 7.516280651092529,
677
+ "kl": 0.27982177734375,
678
+ "learning_rate": 1.967781613449095e-06,
679
+ "loss": 0.0112,
680
+ "reward": 1.5464881896972655,
681
+ "reward_std": 0.8091491930186748,
682
+ "rewards/accuracy_reward": 0.49107144959270954,
683
+ "rewards/cosine_scaled_reward": 0.21672622584737838,
684
+ "rewards/format_reward": 0.0,
685
+ "rewards/reasoning_steps_reward": 0.8386905357241631,
686
+ "step": 215
687
+ },
688
+ {
689
+ "completion_length": 814.1696807861329,
690
+ "epoch": 0.4693333333333333,
691
+ "grad_norm": 0.4684678018093109,
692
+ "kl": 0.194140625,
693
+ "learning_rate": 1.9142928305795637e-06,
694
+ "loss": 0.0078,
695
+ "reward": 1.8477135568857193,
696
+ "reward_std": 0.7414120733737946,
697
+ "rewards/accuracy_reward": 0.6178571652621031,
698
+ "rewards/cosine_scaled_reward": 0.3584277655696496,
699
+ "rewards/format_reward": 0.0,
700
+ "rewards/reasoning_steps_reward": 0.8714286401867867,
701
+ "step": 220
702
+ },
703
+ {
704
+ "completion_length": 754.1857452392578,
705
+ "epoch": 0.48,
706
+ "grad_norm": 0.4328997731208801,
707
+ "kl": 0.12838134765625,
708
+ "learning_rate": 1.8602273707541886e-06,
709
+ "loss": 0.0051,
710
+ "reward": 2.1135876968503,
711
+ "reward_std": 0.6965163860470056,
712
+ "rewards/accuracy_reward": 0.742857176810503,
713
+ "rewards/cosine_scaled_reward": 0.5159685641527176,
714
+ "rewards/format_reward": 0.0,
715
+ "rewards/reasoning_steps_reward": 0.8547619715332985,
716
+ "step": 225
717
+ },
718
+ {
719
+ "completion_length": 742.7750381469726,
720
+ "epoch": 0.49066666666666664,
721
+ "grad_norm": 0.4649052619934082,
722
+ "kl": 0.1558837890625,
723
+ "learning_rate": 1.8056604906573418e-06,
724
+ "loss": 0.0062,
725
+ "reward": 2.0384344711899756,
726
+ "reward_std": 0.6620127268135547,
727
+ "rewards/accuracy_reward": 0.7035714626312256,
728
+ "rewards/cosine_scaled_reward": 0.483077246020548,
729
+ "rewards/format_reward": 0.0,
730
+ "rewards/reasoning_steps_reward": 0.8517857760190963,
731
+ "step": 230
732
+ },
733
+ {
734
+ "completion_length": 739.6268203735351,
735
+ "epoch": 0.5013333333333333,
736
+ "grad_norm": 1.5264660120010376,
737
+ "kl": 0.145806884765625,
738
+ "learning_rate": 1.7506681449278226e-06,
739
+ "loss": 0.0058,
740
+ "reward": 1.999456986784935,
741
+ "reward_std": 0.7032103724777699,
742
+ "rewards/accuracy_reward": 0.6785714574158191,
743
+ "rewards/cosine_scaled_reward": 0.45302835907787087,
744
+ "rewards/format_reward": 0.0,
745
+ "rewards/reasoning_steps_reward": 0.8678572103381157,
746
+ "step": 235
747
+ },
748
+ {
749
+ "completion_length": 725.905387878418,
750
+ "epoch": 0.512,
751
+ "grad_norm": 13.703657150268555,
752
+ "kl": 0.354132080078125,
753
+ "learning_rate": 1.6953268804334257e-06,
754
+ "loss": 0.0142,
755
+ "reward": 2.012031316757202,
756
+ "reward_std": 0.6349152896553278,
757
+ "rewards/accuracy_reward": 0.6660714553669095,
758
+ "rewards/cosine_scaled_reward": 0.46024551438167693,
759
+ "rewards/format_reward": 0.0,
760
+ "rewards/reasoning_steps_reward": 0.8857143551111222,
761
+ "step": 240
762
+ },
763
+ {
764
+ "completion_length": 711.9410980224609,
765
+ "epoch": 0.5226666666666666,
766
+ "grad_norm": 42.922752380371094,
767
+ "kl": 0.81356201171875,
768
+ "learning_rate": 1.6397137297211436e-06,
769
+ "loss": 0.0325,
770
+ "reward": 2.129089578986168,
771
+ "reward_std": 0.699107101932168,
772
+ "rewards/accuracy_reward": 0.7160714577883482,
773
+ "rewards/cosine_scaled_reward": 0.5064704709046055,
774
+ "rewards/format_reward": 0.0,
775
+ "rewards/reasoning_steps_reward": 0.9065476730465889,
776
+ "step": 245
777
+ },
778
+ {
779
+ "completion_length": 738.9821746826171,
780
+ "epoch": 0.5333333333333333,
781
+ "grad_norm": 212.6622314453125,
782
+ "kl": 1.157550048828125,
783
+ "learning_rate": 1.5839061037913395e-06,
784
+ "loss": 0.0463,
785
+ "reward": 2.1009622782468798,
786
+ "reward_std": 0.7158728931099176,
787
+ "rewards/accuracy_reward": 0.7000000283122063,
788
+ "rewards/cosine_scaled_reward": 0.5027479250915349,
789
+ "rewards/format_reward": 0.0,
790
+ "rewards/reasoning_steps_reward": 0.8982143506407738,
791
+ "step": 250
792
+ },
793
+ {
794
+ "completion_length": 760.2428916931152,
795
+ "epoch": 0.544,
796
+ "grad_norm": 10.118670463562012,
797
+ "kl": 0.637158203125,
798
+ "learning_rate": 1.527981684345115e-06,
799
+ "loss": 0.0255,
800
+ "reward": 1.9621681660413741,
801
+ "reward_std": 0.67494813259691,
802
+ "rewards/accuracy_reward": 0.639285740070045,
803
+ "rewards/cosine_scaled_reward": 0.4276442806003615,
804
+ "rewards/format_reward": 0.0,
805
+ "rewards/reasoning_steps_reward": 0.8952381491661072,
806
+ "step": 255
807
+ },
808
+ {
809
+ "completion_length": 754.6803894042969,
810
+ "epoch": 0.5546666666666666,
811
+ "grad_norm": 7.878048419952393,
812
+ "kl": 0.972845458984375,
813
+ "learning_rate": 1.4720183156548855e-06,
814
+ "loss": 0.0389,
815
+ "reward": 1.9780788227915764,
816
+ "reward_std": 0.6262619759887457,
817
+ "rewards/accuracy_reward": 0.6339285982772708,
818
+ "rewards/cosine_scaled_reward": 0.4304597085807472,
819
+ "rewards/format_reward": 0.0,
820
+ "rewards/reasoning_steps_reward": 0.9136905416846275,
821
+ "step": 260
822
+ },
823
+ {
824
+ "completion_length": 751.5857498168946,
825
+ "epoch": 0.5653333333333334,
826
+ "grad_norm": 12.42583179473877,
827
+ "kl": 3.09744873046875,
828
+ "learning_rate": 1.4160938962086612e-06,
829
+ "loss": 0.1241,
830
+ "reward": 2.0433208346366882,
831
+ "reward_std": 0.661328698694706,
832
+ "rewards/accuracy_reward": 0.676785740442574,
833
+ "rewards/cosine_scaled_reward": 0.44689220561413096,
834
+ "rewards/format_reward": 0.0,
835
+ "rewards/reasoning_steps_reward": 0.9196429163217544,
836
+ "step": 265
837
+ },
838
+ {
839
+ "completion_length": 729.028604888916,
840
+ "epoch": 0.576,
841
+ "grad_norm": 7.453009605407715,
842
+ "kl": 2.2955322265625,
843
+ "learning_rate": 1.3602862702788567e-06,
844
+ "loss": 0.0917,
845
+ "reward": 2.094664843380451,
846
+ "reward_std": 0.6356621380895376,
847
+ "rewards/accuracy_reward": 0.7000000346451998,
848
+ "rewards/cosine_scaled_reward": 0.46371242445893585,
849
+ "rewards/format_reward": 0.0,
850
+ "rewards/reasoning_steps_reward": 0.9309524431824684,
851
+ "step": 270
852
+ },
853
+ {
854
+ "completion_length": 730.825032043457,
855
+ "epoch": 0.5866666666666667,
856
+ "grad_norm": 7.0367817878723145,
857
+ "kl": 0.6509521484375,
858
+ "learning_rate": 1.3046731195665748e-06,
859
+ "loss": 0.0261,
860
+ "reward": 2.083331751823425,
861
+ "reward_std": 0.6676435235887765,
862
+ "rewards/accuracy_reward": 0.6821428818628192,
863
+ "rewards/cosine_scaled_reward": 0.45714118536561726,
864
+ "rewards/format_reward": 0.0,
865
+ "rewards/reasoning_steps_reward": 0.944047674536705,
866
+ "step": 275
867
+ },
868
+ {
869
+ "completion_length": 742.180387878418,
870
+ "epoch": 0.5973333333333334,
871
+ "grad_norm": 1.3236949443817139,
872
+ "kl": 4.09298095703125,
873
+ "learning_rate": 1.2493318550721775e-06,
874
+ "loss": 0.1637,
875
+ "reward": 2.075996032357216,
876
+ "reward_std": 0.6379393456503749,
877
+ "rewards/accuracy_reward": 0.6857143174856901,
878
+ "rewards/cosine_scaled_reward": 0.4563530746847391,
879
+ "rewards/format_reward": 0.0,
880
+ "rewards/reasoning_steps_reward": 0.9339286297559738,
881
+ "step": 280
882
+ },
883
+ {
884
+ "completion_length": 708.1018157958985,
885
+ "epoch": 0.608,
886
+ "grad_norm": 5.264936447143555,
887
+ "kl": 0.21192626953125,
888
+ "learning_rate": 1.1943395093426585e-06,
889
+ "loss": 0.0085,
890
+ "reward": 2.1390477627515794,
891
+ "reward_std": 0.600306774303317,
892
+ "rewards/accuracy_reward": 0.7196428820490837,
893
+ "rewards/cosine_scaled_reward": 0.49619057439267633,
894
+ "rewards/format_reward": 0.0,
895
+ "rewards/reasoning_steps_reward": 0.9232143506407737,
896
+ "step": 285
897
+ },
898
+ {
899
+ "completion_length": 715.4125289916992,
900
+ "epoch": 0.6186666666666667,
901
+ "grad_norm": 2.6887574195861816,
902
+ "kl": 2.8669677734375,
903
+ "learning_rate": 1.1397726292458115e-06,
904
+ "loss": 0.1151,
905
+ "reward": 2.1179503470659258,
906
+ "reward_std": 0.5490788316354156,
907
+ "rewards/accuracy_reward": 0.7053571708500386,
908
+ "rewards/cosine_scaled_reward": 0.4905693273060024,
909
+ "rewards/format_reward": 0.0,
910
+ "rewards/reasoning_steps_reward": 0.9220238789916039,
911
+ "step": 290
912
+ },
913
+ {
914
+ "completion_length": 742.6803916931152,
915
+ "epoch": 0.6293333333333333,
916
+ "grad_norm": 6.9418721199035645,
917
+ "kl": 0.39151611328125,
918
+ "learning_rate": 1.085707169420437e-06,
919
+ "loss": 0.0157,
920
+ "reward": 1.8962592497467994,
921
+ "reward_std": 0.6060247957706452,
922
+ "rewards/accuracy_reward": 0.5964285938069225,
923
+ "rewards/cosine_scaled_reward": 0.3754258565604687,
924
+ "rewards/format_reward": 0.0,
925
+ "rewards/reasoning_steps_reward": 0.924404813349247,
926
+ "step": 295
927
+ },
928
+ {
929
+ "completion_length": 716.3464584350586,
930
+ "epoch": 0.64,
931
+ "grad_norm": 4.2906060218811035,
932
+ "kl": 0.57667236328125,
933
+ "learning_rate": 1.0322183865509054e-06,
934
+ "loss": 0.0231,
935
+ "reward": 2.1815308302640917,
936
+ "reward_std": 0.6235232371836901,
937
+ "rewards/accuracy_reward": 0.7428571732714773,
938
+ "rewards/cosine_scaled_reward": 0.5255783690838143,
939
+ "rewards/format_reward": 0.0,
940
+ "rewards/reasoning_steps_reward": 0.913095298409462,
941
+ "step": 300
942
+ },
943
+ {
944
+ "epoch": 0.64,
945
+ "eval_completion_length": 728.9849459716797,
946
+ "eval_kl": 22.31169453125,
947
+ "eval_loss": 0.8926114439964294,
948
+ "eval_reward": 1.9843467233777046,
949
+ "eval_reward_std": 0.6538388645738363,
950
+ "eval_rewards/accuracy_reward": 0.6382285982251167,
951
+ "eval_rewards/cosine_scaled_reward": 0.41530855364510644,
952
+ "eval_rewards/format_reward": 0.0,
953
+ "eval_rewards/reasoning_steps_reward": 0.9308095807313919,
954
+ "eval_runtime": 32207.7986,
955
+ "eval_samples_per_second": 0.155,
956
+ "eval_steps_per_second": 0.011,
957
+ "step": 300
958
+ },
959
+ {
960
+ "completion_length": 723.2625328063965,
961
+ "epoch": 0.6506666666666666,
962
+ "grad_norm": 79.97950744628906,
963
+ "kl": 487.1179443359375,
964
+ "learning_rate": 9.793807346132464e-07,
965
+ "loss": 19.4474,
966
+ "reward": 2.162437987327576,
967
+ "reward_std": 0.6324797321110964,
968
+ "rewards/accuracy_reward": 0.7267857410013676,
969
+ "rewards/cosine_scaled_reward": 0.5112474345514784,
970
+ "rewards/format_reward": 0.0,
971
+ "rewards/reasoning_steps_reward": 0.9244048178195954,
972
+ "step": 305
973
+ },
974
+ {
975
+ "completion_length": 739.6375335693359,
976
+ "epoch": 0.6613333333333333,
977
+ "grad_norm": 9.395992279052734,
978
+ "kl": 0.60579833984375,
979
+ "learning_rate": 9.272677612385667e-07,
980
+ "loss": 0.0242,
981
+ "reward": 2.004467612504959,
982
+ "reward_std": 0.6282935816794634,
983
+ "rewards/accuracy_reward": 0.6607143184170127,
984
+ "rewards/cosine_scaled_reward": 0.42589613443706187,
985
+ "rewards/format_reward": 0.0,
986
+ "rewards/reasoning_steps_reward": 0.9178571999073029,
987
+ "step": 310
988
+ },
989
+ {
990
+ "completion_length": 735.6286071777344,
991
+ "epoch": 0.672,
992
+ "grad_norm": 12.830111503601074,
993
+ "kl": 0.9565673828125,
994
+ "learning_rate": 8.759520053380591e-07,
995
+ "loss": 0.0383,
996
+ "reward": 1.9197196617722512,
997
+ "reward_std": 0.6299623921513557,
998
+ "rewards/accuracy_reward": 0.6035714576020836,
999
+ "rewards/cosine_scaled_reward": 0.39055290608666837,
1000
+ "rewards/format_reward": 0.0,
1001
+ "rewards/reasoning_steps_reward": 0.9255953043699264,
1002
+ "step": 315
1003
+ },
1004
+ {
1005
+ "completion_length": 718.0571731567383,
1006
+ "epoch": 0.6826666666666666,
1007
+ "grad_norm": 176.6972198486328,
1008
+ "kl": 1.54287109375,
1009
+ "learning_rate": 8.255048961321088e-07,
1010
+ "loss": 0.0618,
1011
+ "reward": 2.1281729131937026,
1012
+ "reward_std": 0.6808584026992321,
1013
+ "rewards/accuracy_reward": 0.714285746216774,
1014
+ "rewards/cosine_scaled_reward": 0.4888871216215193,
1015
+ "rewards/format_reward": 0.0,
1016
+ "rewards/reasoning_steps_reward": 0.9250000536441803,
1017
+ "step": 320
1018
+ },
1019
+ {
1020
+ "completion_length": 721.4732475280762,
1021
+ "epoch": 0.6933333333333334,
1022
+ "grad_norm": 6.025720119476318,
1023
+ "kl": 0.98104248046875,
1024
+ "learning_rate": 7.759966537240373e-07,
1025
+ "loss": 0.0392,
1026
+ "reward": 2.054315000772476,
1027
+ "reward_std": 0.6834255807101727,
1028
+ "rewards/accuracy_reward": 0.6714285992085933,
1029
+ "rewards/cosine_scaled_reward": 0.45312447142787277,
1030
+ "rewards/format_reward": 0.0,
1031
+ "rewards/reasoning_steps_reward": 0.9297619640827179,
1032
+ "step": 325
1033
+ },
1034
+ {
1035
+ "completion_length": 729.3982498168946,
1036
+ "epoch": 0.704,
1037
+ "grad_norm": 6.682721138000488,
1038
+ "kl": 2.40982666015625,
1039
+ "learning_rate": 7.274961913568773e-07,
1040
+ "loss": 0.0964,
1041
+ "reward": 2.0376005843281746,
1042
+ "reward_std": 0.7055317234247923,
1043
+ "rewards/accuracy_reward": 0.6660714562982321,
1044
+ "rewards/cosine_scaled_reward": 0.4655766852200031,
1045
+ "rewards/format_reward": 0.0,
1046
+ "rewards/reasoning_steps_reward": 0.9059524461627007,
1047
+ "step": 330
1048
+ },
1049
+ {
1050
+ "completion_length": 737.005387878418,
1051
+ "epoch": 0.7146666666666667,
1052
+ "grad_norm": 21.818754196166992,
1053
+ "kl": 0.653094482421875,
1054
+ "learning_rate": 6.800710194892484e-07,
1055
+ "loss": 0.0261,
1056
+ "reward": 2.056803268194199,
1057
+ "reward_std": 0.7108213260769844,
1058
+ "rewards/accuracy_reward": 0.6660714574158192,
1059
+ "rewards/cosine_scaled_reward": 0.45680318772792816,
1060
+ "rewards/format_reward": 0.0,
1061
+ "rewards/reasoning_steps_reward": 0.9339286327362061,
1062
+ "step": 335
1063
+ },
1064
+ {
1065
+ "completion_length": 729.6393203735352,
1066
+ "epoch": 0.7253333333333334,
1067
+ "grad_norm": 4.025352954864502,
1068
+ "kl": 0.63848876953125,
1069
+ "learning_rate": 6.33787151823836e-07,
1070
+ "loss": 0.0256,
1071
+ "reward": 1.9720933943986894,
1072
+ "reward_std": 0.6898978160694241,
1073
+ "rewards/accuracy_reward": 0.6250000264495611,
1074
+ "rewards/cosine_scaled_reward": 0.42685523356776683,
1075
+ "rewards/format_reward": 0.0,
1076
+ "rewards/reasoning_steps_reward": 0.9202381581068039,
1077
+ "step": 340
1078
+ },
1079
+ {
1080
+ "completion_length": 699.1571701049804,
1081
+ "epoch": 0.736,
1082
+ "grad_norm": 5.142830848693848,
1083
+ "kl": 0.65721435546875,
1084
+ "learning_rate": 5.887090134192947e-07,
1085
+ "loss": 0.0263,
1086
+ "reward": 2.100009024143219,
1087
+ "reward_std": 0.6496724892407656,
1088
+ "rewards/accuracy_reward": 0.6910714615136385,
1089
+ "rewards/cosine_scaled_reward": 0.4851280112750828,
1090
+ "rewards/format_reward": 0.0,
1091
+ "rewards/reasoning_steps_reward": 0.9238095805048943,
1092
+ "step": 345
1093
+ },
1094
+ {
1095
+ "completion_length": 723.3910995483399,
1096
+ "epoch": 0.7466666666666667,
1097
+ "grad_norm": 4.602946758270264,
1098
+ "kl": 0.394140625,
1099
+ "learning_rate": 5.448993510134669e-07,
1100
+ "loss": 0.0158,
1101
+ "reward": 2.0926264360547067,
1102
+ "reward_std": 0.6916316740214825,
1103
+ "rewards/accuracy_reward": 0.6857143180444837,
1104
+ "rewards/cosine_scaled_reward": 0.4831025514518842,
1105
+ "rewards/format_reward": 0.0,
1106
+ "rewards/reasoning_steps_reward": 0.9238095790147781,
1107
+ "step": 350
1108
+ },
1109
+ {
1110
+ "completion_length": 722.5375305175781,
1111
+ "epoch": 0.7573333333333333,
1112
+ "grad_norm": 6.0756731033325195,
1113
+ "kl": 1.08592529296875,
1114
+ "learning_rate": 5.024191456827498e-07,
1115
+ "loss": 0.0435,
1116
+ "reward": 2.0994770556688307,
1117
+ "reward_std": 0.666194306127727,
1118
+ "rewards/accuracy_reward": 0.6982143167406321,
1119
+ "rewards/cosine_scaled_reward": 0.4917388891801238,
1120
+ "rewards/format_reward": 0.0,
1121
+ "rewards/reasoning_steps_reward": 0.9095238700509072,
1122
+ "step": 355
1123
+ },
1124
+ {
1125
+ "completion_length": 713.5250350952149,
1126
+ "epoch": 0.768,
1127
+ "grad_norm": 7.16264533996582,
1128
+ "kl": 26.27894287109375,
1129
+ "learning_rate": 4.6132752795918667e-07,
1130
+ "loss": 1.0497,
1131
+ "reward": 2.055359125137329,
1132
+ "reward_std": 0.7066416556015611,
1133
+ "rewards/accuracy_reward": 0.6678571753203869,
1134
+ "rewards/cosine_scaled_reward": 0.4732161985710263,
1135
+ "rewards/format_reward": 0.0,
1136
+ "rewards/reasoning_steps_reward": 0.9142857760190963,
1137
+ "step": 360
1138
+ },
1139
+ {
1140
+ "completion_length": 751.5964584350586,
1141
+ "epoch": 0.7786666666666666,
1142
+ "grad_norm": 3.023808002471924,
1143
+ "kl": 1.154327392578125,
1144
+ "learning_rate": 4.2168169552342905e-07,
1145
+ "loss": 0.0462,
1146
+ "reward": 1.9766315311193465,
1147
+ "reward_std": 0.7433438140898943,
1148
+ "rewards/accuracy_reward": 0.6339286021888256,
1149
+ "rewards/cosine_scaled_reward": 0.42544099894585086,
1150
+ "rewards/format_reward": 0.0,
1151
+ "rewards/reasoning_steps_reward": 0.9172619596123696,
1152
+ "step": 365
1153
+ },
1154
+ {
1155
+ "completion_length": 704.278596496582,
1156
+ "epoch": 0.7893333333333333,
1157
+ "grad_norm": 1.0741926431655884,
1158
+ "kl": 0.53934326171875,
1159
+ "learning_rate": 3.8353683358814046e-07,
1160
+ "loss": 0.0216,
1161
+ "reward": 2.0491741001605988,
1162
+ "reward_std": 0.587555892020464,
1163
+ "rewards/accuracy_reward": 0.6678571693599225,
1164
+ "rewards/cosine_scaled_reward": 0.46226926781237127,
1165
+ "rewards/format_reward": 0.0,
1166
+ "rewards/reasoning_steps_reward": 0.9190476790070534,
1167
+ "step": 370
1168
+ },
1169
+ {
1170
+ "completion_length": 738.875033569336,
1171
+ "epoch": 0.8,
1172
+ "grad_norm": 41.52888870239258,
1173
+ "kl": 0.6643310546875,
1174
+ "learning_rate": 3.469460380826697e-07,
1175
+ "loss": 0.0265,
1176
+ "reward": 2.0449665546417237,
1177
+ "reward_std": 0.6989724855870009,
1178
+ "rewards/accuracy_reward": 0.6625000312924385,
1179
+ "rewards/cosine_scaled_reward": 0.4574664521496743,
1180
+ "rewards/format_reward": 0.0,
1181
+ "rewards/reasoning_steps_reward": 0.9250000640749931,
1182
+ "step": 375
1183
+ },
1184
+ {
1185
+ "completion_length": 724.0678855895997,
1186
+ "epoch": 0.8106666666666666,
1187
+ "grad_norm": 4.322193145751953,
1188
+ "kl": 0.7086669921875,
1189
+ "learning_rate": 3.119602417459075e-07,
1190
+ "loss": 0.0284,
1191
+ "reward": 2.055614770948887,
1192
+ "reward_std": 0.6039443843066692,
1193
+ "rewards/accuracy_reward": 0.667857171408832,
1194
+ "rewards/cosine_scaled_reward": 0.46275755076203495,
1195
+ "rewards/format_reward": 0.0,
1196
+ "rewards/reasoning_steps_reward": 0.9250000655651093,
1197
+ "step": 380
1198
+ },
1199
+ {
1200
+ "completion_length": 739.5125350952148,
1201
+ "epoch": 0.8213333333333334,
1202
+ "grad_norm": 4.056361198425293,
1203
+ "kl": 0.7447509765625,
1204
+ "learning_rate": 2.786281432302071e-07,
1205
+ "loss": 0.0298,
1206
+ "reward": 2.0523035705089567,
1207
+ "reward_std": 0.6267267379909753,
1208
+ "rewards/accuracy_reward": 0.6750000279396773,
1209
+ "rewards/cosine_scaled_reward": 0.4463511134439614,
1210
+ "rewards/format_reward": 0.0,
1211
+ "rewards/reasoning_steps_reward": 0.9309524476528168,
1212
+ "step": 385
1213
+ },
1214
+ {
1215
+ "completion_length": 722.548243713379,
1216
+ "epoch": 0.832,
1217
+ "grad_norm": 1.378568410873413,
1218
+ "kl": 0.501007080078125,
1219
+ "learning_rate": 2.46996139315057e-07,
1220
+ "loss": 0.02,
1221
+ "reward": 2.0793206453323365,
1222
+ "reward_std": 0.6533296214416623,
1223
+ "rewards/accuracy_reward": 0.6875000290572644,
1224
+ "rewards/cosine_scaled_reward": 0.4781300783797633,
1225
+ "rewards/format_reward": 0.0,
1226
+ "rewards/reasoning_steps_reward": 0.9136905401945115,
1227
+ "step": 390
1228
+ },
1229
+ {
1230
+ "completion_length": 732.4714630126953,
1231
+ "epoch": 0.8426666666666667,
1232
+ "grad_norm": 2.4824626445770264,
1233
+ "kl": 0.71015625,
1234
+ "learning_rate": 2.1710826032485286e-07,
1235
+ "loss": 0.0284,
1236
+ "reward": 2.1464335188269614,
1237
+ "reward_std": 0.6267410140484572,
1238
+ "rewards/accuracy_reward": 0.7071428874507546,
1239
+ "rewards/cosine_scaled_reward": 0.5136953465640545,
1240
+ "rewards/format_reward": 0.0,
1241
+ "rewards/reasoning_steps_reward": 0.9255953013896943,
1242
+ "step": 395
1243
+ },
1244
+ {
1245
+ "completion_length": 769.8607528686523,
1246
+ "epoch": 0.8533333333333334,
1247
+ "grad_norm": 5.1279401779174805,
1248
+ "kl": 0.787158203125,
1249
+ "learning_rate": 1.8900610884066817e-07,
1250
+ "loss": 0.0315,
1251
+ "reward": 1.9811220198869706,
1252
+ "reward_std": 0.6900037627667188,
1253
+ "rewards/accuracy_reward": 0.6357143126428128,
1254
+ "rewards/cosine_scaled_reward": 0.4329076783033088,
1255
+ "rewards/format_reward": 0.0,
1256
+ "rewards/reasoning_steps_reward": 0.912500062584877,
1257
+ "step": 400
1258
+ },
1259
+ {
1260
+ "epoch": 0.8533333333333334,
1261
+ "eval_completion_length": 738.6478045166016,
1262
+ "eval_kl": 0.67065634765625,
1263
+ "eval_loss": 0.026821324601769447,
1264
+ "eval_reward": 1.9358687758922577,
1265
+ "eval_reward_std": 0.681571420711279,
1266
+ "eval_rewards/accuracy_reward": 0.6160857413113118,
1267
+ "eval_rewards/cosine_scaled_reward": 0.4032782297934056,
1268
+ "eval_rewards/format_reward": 0.0,
1269
+ "eval_rewards/reasoning_steps_reward": 0.9165048221349716,
1270
+ "eval_runtime": 32285.4404,
1271
+ "eval_samples_per_second": 0.155,
1272
+ "eval_steps_per_second": 0.011,
1273
+ "step": 400
1274
+ },
1275
+ {
1276
+ "completion_length": 763.4339599609375,
1277
+ "epoch": 0.864,
1278
+ "grad_norm": 4.143102169036865,
1279
+ "kl": 0.609136962890625,
1280
+ "learning_rate": 1.627288017913383e-07,
1281
+ "loss": 0.0244,
1282
+ "reward": 1.9788720414042473,
1283
+ "reward_std": 0.6925495602190495,
1284
+ "rewards/accuracy_reward": 0.6375000275671482,
1285
+ "rewards/cosine_scaled_reward": 0.42411007191985844,
1286
+ "rewards/format_reward": 0.0,
1287
+ "rewards/reasoning_steps_reward": 0.9172619670629502,
1288
+ "step": 405
1289
+ },
1290
+ {
1291
+ "completion_length": 754.2500312805175,
1292
+ "epoch": 0.8746666666666667,
1293
+ "grad_norm": 4.33268928527832,
1294
+ "kl": 0.9586181640625,
1295
+ "learning_rate": 1.3831291600445573e-07,
1296
+ "loss": 0.0383,
1297
+ "reward": 1.9650759071111679,
1298
+ "reward_std": 0.6423604141920805,
1299
+ "rewards/accuracy_reward": 0.6303571704775095,
1300
+ "rewards/cosine_scaled_reward": 0.4222186904400587,
1301
+ "rewards/format_reward": 0.0,
1302
+ "rewards/reasoning_steps_reward": 0.912500062584877,
1303
+ "step": 410
1304
+ },
1305
+ {
1306
+ "completion_length": 751.8071762084961,
1307
+ "epoch": 0.8853333333333333,
1308
+ "grad_norm": 7.097233295440674,
1309
+ "kl": 0.8556884765625,
1310
+ "learning_rate": 1.1579243729307487e-07,
1311
+ "loss": 0.0342,
1312
+ "reward": 1.9338065341114998,
1313
+ "reward_std": 0.7414230849593878,
1314
+ "rewards/accuracy_reward": 0.6321428898721934,
1315
+ "rewards/cosine_scaled_reward": 0.41178265907801687,
1316
+ "rewards/format_reward": 0.0,
1317
+ "rewards/reasoning_steps_reward": 0.8898810192942619,
1318
+ "step": 415
1319
+ },
1320
+ {
1321
+ "completion_length": 752.8571723937988,
1322
+ "epoch": 0.896,
1323
+ "grad_norm": 3.0274124145507812,
1324
+ "kl": 0.67294921875,
1325
+ "learning_rate": 9.519871314899092e-08,
1326
+ "loss": 0.0269,
1327
+ "reward": 1.9913182631134987,
1328
+ "reward_std": 0.7086525153368711,
1329
+ "rewards/accuracy_reward": 0.6571428876370191,
1330
+ "rewards/cosine_scaled_reward": 0.4359610580140725,
1331
+ "rewards/format_reward": 0.0,
1332
+ "rewards/reasoning_steps_reward": 0.8982143491506577,
1333
+ "step": 420
1334
+ },
1335
+ {
1336
+ "completion_length": 751.7411056518555,
1337
+ "epoch": 0.9066666666666666,
1338
+ "grad_norm": 1.3194289207458496,
1339
+ "kl": 0.722802734375,
1340
+ "learning_rate": 7.656040910844358e-08,
1341
+ "loss": 0.0289,
1342
+ "reward": 2.0188252568244933,
1343
+ "reward_std": 0.7707155652344226,
1344
+ "rewards/accuracy_reward": 0.644642885029316,
1345
+ "rewards/cosine_scaled_reward": 0.44144419142976404,
1346
+ "rewards/format_reward": 0.0,
1347
+ "rewards/reasoning_steps_reward": 0.9327381521463394,
1348
+ "step": 425
1349
+ },
1350
+ {
1351
+ "completion_length": 755.0464630126953,
1352
+ "epoch": 0.9173333333333333,
1353
+ "grad_norm": 4.276956081390381,
1354
+ "kl": 0.9569580078125,
1355
+ "learning_rate": 5.990346885098235e-08,
1356
+ "loss": 0.0383,
1357
+ "reward": 2.000167742371559,
1358
+ "reward_std": 0.7376608021557332,
1359
+ "rewards/accuracy_reward": 0.6589285988360644,
1360
+ "rewards/cosine_scaled_reward": 0.45314384531229734,
1361
+ "rewards/format_reward": 0.0,
1362
+ "rewards/reasoning_steps_reward": 0.8880952954292297,
1363
+ "step": 430
1364
+ },
1365
+ {
1366
+ "completion_length": 727.2536087036133,
1367
+ "epoch": 0.928,
1368
+ "grad_norm": 19.139204025268555,
1369
+ "kl": 1.32947998046875,
1370
+ "learning_rate": 4.5251078087033493e-08,
1371
+ "loss": 0.0532,
1372
+ "reward": 2.039694218337536,
1373
+ "reward_std": 0.6533694989979267,
1374
+ "rewards/accuracy_reward": 0.6732143165543676,
1375
+ "rewards/cosine_scaled_reward": 0.4462417368311435,
1376
+ "rewards/format_reward": 0.0,
1377
+ "rewards/reasoning_steps_reward": 0.9202381521463394,
1378
+ "step": 435
1379
+ },
1380
+ {
1381
+ "completion_length": 734.5536064147949,
1382
+ "epoch": 0.9386666666666666,
1383
+ "grad_norm": 9.922527313232422,
1384
+ "kl": 1.4177001953125,
1385
+ "learning_rate": 3.262363228443427e-08,
1386
+ "loss": 0.0567,
1387
+ "reward": 1.9774114236235618,
1388
+ "reward_std": 0.7198221303522587,
1389
+ "rewards/accuracy_reward": 0.6571428865194321,
1390
+ "rewards/cosine_scaled_reward": 0.4309827778954059,
1391
+ "rewards/format_reward": 0.0,
1392
+ "rewards/reasoning_steps_reward": 0.8892857789993286,
1393
+ "step": 440
1394
+ },
1395
+ {
1396
+ "completion_length": 755.5053962707519,
1397
+ "epoch": 0.9493333333333334,
1398
+ "grad_norm": 3.058717727661133,
1399
+ "kl": 1.02747802734375,
1400
+ "learning_rate": 2.2038708278862952e-08,
1401
+ "loss": 0.0411,
1402
+ "reward": 1.9413904681801797,
1403
+ "reward_std": 0.6192027345299721,
1404
+ "rewards/accuracy_reward": 0.6214285951107741,
1405
+ "rewards/cosine_scaled_reward": 0.41579519272781906,
1406
+ "rewards/format_reward": 0.0,
1407
+ "rewards/reasoning_steps_reward": 0.9041667267680168,
1408
+ "step": 445
1409
+ },
1410
+ {
1411
+ "completion_length": 723.6143173217773,
1412
+ "epoch": 0.96,
1413
+ "grad_norm": 2.64345383644104,
1414
+ "kl": 0.74544677734375,
1415
+ "learning_rate": 1.3511039807673209e-08,
1416
+ "loss": 0.0298,
1417
+ "reward": 2.1570381984114646,
1418
+ "reward_std": 0.6153812855482101,
1419
+ "rewards/accuracy_reward": 0.7089286003261804,
1420
+ "rewards/cosine_scaled_reward": 0.5165619559586048,
1421
+ "rewards/format_reward": 0.0,
1422
+ "rewards/reasoning_steps_reward": 0.9315476790070534,
1423
+ "step": 450
1424
+ },
1425
+ {
1426
+ "completion_length": 728.894679260254,
1427
+ "epoch": 0.9706666666666667,
1428
+ "grad_norm": 2.217505693435669,
1429
+ "kl": 0.678607177734375,
1430
+ "learning_rate": 7.0524970011963675e-09,
1431
+ "loss": 0.0272,
1432
+ "reward": 2.2157696574926375,
1433
+ "reward_std": 0.6317826233804226,
1434
+ "rewards/accuracy_reward": 0.7500000305473804,
1435
+ "rewards/cosine_scaled_reward": 0.5425553207285703,
1436
+ "rewards/format_reward": 0.0,
1437
+ "rewards/reasoning_steps_reward": 0.9232143491506577,
1438
+ "step": 455
1439
+ },
1440
+ {
1441
+ "completion_length": 722.3839637756348,
1442
+ "epoch": 0.9813333333333333,
1443
+ "grad_norm": 3.196773052215576,
1444
+ "kl": 0.709228515625,
1445
+ "learning_rate": 2.6720698600553595e-09,
1446
+ "loss": 0.0284,
1447
+ "reward": 2.122882993519306,
1448
+ "reward_std": 0.599827627837658,
1449
+ "rewards/accuracy_reward": 0.7017857432365417,
1450
+ "rewards/cosine_scaled_reward": 0.5175257750786841,
1451
+ "rewards/format_reward": 0.0,
1452
+ "rewards/reasoning_steps_reward": 0.9035714983940124,
1453
+ "step": 460
1454
+ },
1455
+ {
1456
+ "completion_length": 754.775032043457,
1457
+ "epoch": 0.992,
1458
+ "grad_norm": 8.455827713012695,
1459
+ "kl": 0.835205078125,
1460
+ "learning_rate": 3.7585574148779613e-10,
1461
+ "loss": 0.0334,
1462
+ "reward": 1.9985675051808358,
1463
+ "reward_std": 0.7642196819186211,
1464
+ "rewards/accuracy_reward": 0.6500000316649676,
1465
+ "rewards/cosine_scaled_reward": 0.4402340850589098,
1466
+ "rewards/format_reward": 0.0,
1467
+ "rewards/reasoning_steps_reward": 0.9083333894610405,
1468
+ "step": 465
1469
+ },
1470
+ {
1471
+ "completion_length": 746.1607462565104,
1472
+ "epoch": 0.9984,
1473
+ "kl": 0.8069661458333334,
1474
+ "reward": 2.0161508160332837,
1475
+ "reward_std": 0.7148686709503332,
1476
+ "rewards/accuracy_reward": 0.6636905111372471,
1477
+ "rewards/cosine_scaled_reward": 0.4536507367156446,
1478
+ "rewards/format_reward": 0.0,
1479
+ "rewards/reasoning_steps_reward": 0.898809589445591,
1480
+ "step": 468,
1481
+ "total_flos": 0.0,
1482
+ "train_loss": 4841.422249500714,
1483
+ "train_runtime": 180396.3107,
1484
+ "train_samples_per_second": 0.042,
1485
+ "train_steps_per_second": 0.003
1486
+ }
1487
+ ],
1488
+ "logging_steps": 5,
1489
+ "max_steps": 468,
1490
+ "num_input_tokens_seen": 0,
1491
+ "num_train_epochs": 1,
1492
+ "save_steps": 200,
1493
+ "stateful_callbacks": {
1494
+ "TrainerControl": {
1495
+ "args": {
1496
+ "should_epoch_stop": false,
1497
+ "should_evaluate": false,
1498
+ "should_log": false,
1499
+ "should_save": false,
1500
+ "should_training_stop": false
1501
+ },
1502
+ "attributes": {}
1503
+ }
1504
+ },
1505
+ "total_flos": 0.0,
1506
+ "train_batch_size": 2,
1507
+ "trial_name": null,
1508
+ "trial_params": null
1509
+ }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:dce257db8de0248ed189e939e06690f3b2a1bc42bd4d26daa1cac73a1b5e4131
3
  size 7480
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1373fdcb93653901ac277fd852565f578e438f5f5b258915dc888ae42be7ad33
3
  size 7480