simonycl committed
Commit ba59906
1 parent: 4555faf

Upload folder using huggingface_hub

README.md CHANGED
@@ -19,15 +19,15 @@ should probably proofread and complete it, then remove this comment. -->
 
 This model is a fine-tuned version of [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) on the simonycl/llama3.1-ultrafeedback-annotate-armorm dataset.
 It achieves the following results on the evaluation set:
- - Loss: 0.3984
- - Rewards/chosen: -3.3263
- - Rewards/rejected: -5.1260
- - Rewards/accuracies: 0.8286
- - Rewards/margins: 1.7997
- - Logps/rejected: -786.4965
- - Logps/chosen: -595.5199
- - Logits/rejected: -2.6865
- - Logits/chosen: -2.7593
+ - Loss: 0.3837
+ - Rewards/chosen: -3.2511
+ - Rewards/rejected: -5.1202
+ - Rewards/accuracies: 0.8644
+ - Rewards/margins: 1.8691
+ - Logps/rejected: -797.6878
+ - Logps/chosen: -602.0981
+ - Logits/rejected: -1.3603
+ - Logits/chosen: -1.3921
 
 ## Model description
 
@@ -47,14 +47,14 @@ More information needed
 
 The following hyperparameters were used during training:
 - learning_rate: 5e-07
- - train_batch_size: 2
- - eval_batch_size: 4
+ - train_batch_size: 1
+ - eval_batch_size: 1
 - seed: 42
 - distributed_type: multi-GPU
 - num_devices: 4
- - gradient_accumulation_steps: 16
+ - gradient_accumulation_steps: 32
 - total_train_batch_size: 128
- - total_eval_batch_size: 16
+ - total_eval_batch_size: 4
 - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
 - lr_scheduler_type: cosine
 - lr_scheduler_warmup_ratio: 0.1
@@ -64,7 +64,7 @@ The following hyperparameters were used during training:
 
 | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
 |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
- | 0.4222 | 0.8443 | 400 | 0.3984 | -3.3263 | -5.1260 | 0.8286 | 1.7997 | -786.4965 | -595.5199 | -2.6865 | -2.7593 |
+ | 0.4269 | 0.8444 | 400 | 0.3837 | -3.2511 | -5.1202 | 0.8644 | 1.8691 | -797.6878 | -602.0981 | -1.3603 | -1.3921 |
 
 
 ### Framework versions
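The hyperparameter change above halves the per-device batch size while doubling gradient accumulation, so the effective batch size is unchanged, and the reward margins in the eval metrics are simply the gap between the chosen and rejected rewards. A minimal sketch of that arithmetic, using only values from the diff (the helper name is illustrative, not part of the training code):

```python
# Sanity checks on the numbers in the README diff above (illustrative only).

def effective_batch_size(per_device: int, grad_accum: int, num_devices: int) -> int:
    # total_train_batch_size = per-device batch * accumulation steps * GPUs
    return per_device * grad_accum * num_devices

assert effective_batch_size(2, 16, 4) == 128   # old config
assert effective_batch_size(1, 32, 4) == 128   # new config

# Rewards/margins = Rewards/chosen - Rewards/rejected
assert round(-3.3263 - (-5.1260), 4) == 1.7997  # old eval metrics
assert round(-3.2511 - (-5.1202), 4) == 1.8691  # new eval metrics
```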
all_results.json CHANGED
@@ -1,22 +1,22 @@
 {
- "epoch": 0.9984168865435357,
- "eval_logits/chosen": -2.766282558441162,
- "eval_logits/rejected": -2.6962404251098633,
- "eval_logps/chosen": -612.2565307617188,
- "eval_logps/rejected": -809.2787475585938,
- "eval_loss": 0.3973737061023712,
- "eval_rewards/accuracies": 0.8286290168762207,
- "eval_rewards/chosen": -3.4936444759368896,
- "eval_rewards/margins": 1.8601694107055664,
- "eval_rewards/rejected": -5.353814125061035,
- "eval_runtime": 316.1088,
+ "epoch": 0.9984827495217362,
+ "eval_logits/chosen": -1.4033699035644531,
+ "eval_logits/rejected": -1.3698593378067017,
+ "eval_logps/chosen": -627.2847900390625,
+ "eval_logps/rejected": -837.4208374023438,
+ "eval_loss": 0.3820858299732208,
+ "eval_rewards/accuracies": 0.8663967847824097,
+ "eval_rewards/chosen": -3.5029525756835938,
+ "eval_rewards/margins": 2.0145938396453857,
+ "eval_rewards/rejected": -5.517546653747559,
+ "eval_runtime": 316.7033,
 "eval_samples": 1976,
- "eval_samples_per_second": 6.251,
- "eval_steps_per_second": 0.392,
+ "eval_samples_per_second": 6.239,
+ "eval_steps_per_second": 1.56,
 "total_flos": 0.0,
- "train_loss": 0.466365703316622,
- "train_runtime": 19524.7969,
+ "train_loss": 0.4645486564767285,
+ "train_runtime": 24049.7915,
 "train_samples": 60634,
- "train_samples_per_second": 3.105,
- "train_steps_per_second": 0.024
+ "train_samples_per_second": 2.521,
+ "train_steps_per_second": 0.02
 }
eval_results.json CHANGED
@@ -1,16 +1,16 @@
 {
- "epoch": 0.9984168865435357,
- "eval_logits/chosen": -2.766282558441162,
- "eval_logits/rejected": -2.6962404251098633,
- "eval_logps/chosen": -612.2565307617188,
- "eval_logps/rejected": -809.2787475585938,
- "eval_loss": 0.3973737061023712,
- "eval_rewards/accuracies": 0.8286290168762207,
- "eval_rewards/chosen": -3.4936444759368896,
- "eval_rewards/margins": 1.8601694107055664,
- "eval_rewards/rejected": -5.353814125061035,
- "eval_runtime": 316.1088,
+ "epoch": 0.9984827495217362,
+ "eval_logits/chosen": -1.4033699035644531,
+ "eval_logits/rejected": -1.3698593378067017,
+ "eval_logps/chosen": -627.2847900390625,
+ "eval_logps/rejected": -837.4208374023438,
+ "eval_loss": 0.3820858299732208,
+ "eval_rewards/accuracies": 0.8663967847824097,
+ "eval_rewards/chosen": -3.5029525756835938,
+ "eval_rewards/margins": 2.0145938396453857,
+ "eval_rewards/rejected": -5.517546653747559,
+ "eval_runtime": 316.7033,
 "eval_samples": 1976,
- "eval_samples_per_second": 6.251,
- "eval_steps_per_second": 0.392
+ "eval_samples_per_second": 6.239,
+ "eval_steps_per_second": 1.56
 }
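One change in the results files above that is easy to misread: eval_steps_per_second jumps from 0.392 to 1.56 while eval_samples_per_second is essentially flat. That follows from the eval batch-size change in the README (total_eval_batch_size 16 vs. 4), not from a speedup. A rough sketch of the accounting, with values copied from the diff:

```python
# eval_steps_per_second ~ eval_samples_per_second / total_eval_batch_size
# (approximate: the final batch of the 1976 eval samples may be partial)
old_steps_per_sec = 6.251 / 16   # ~0.39, reported as 0.392
new_steps_per_sec = 6.239 / 4    # ~1.56, reported as 1.56
print(round(old_steps_per_sec, 3), round(new_steps_per_sec, 3))
```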
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:7b99de5acafddb523c85352e26759ee31d3fd7367d7cd431aa6b2e78cb1e0cd8
+ oid sha256:c9ba9f8b42762f10643f8def0ec6cfddc799529412cca31ac9f19337c5249491
 size 4976698672
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:baedc9d7e4a2f3e6703726c9aefe638306225965c932b69e48bdf92f2e35cf71
+ oid sha256:6acb42a4e8fa15ce25a4843b4ca4e825d92626a38e32dfaad5076e605dc736ec
 size 4999802720
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:a82bb7fe8dd7c5f6344976f170b6aed710c845c48c5d193c73020f86712b9ed1
+ oid sha256:6a8af3d5fba0a84018754fe35dc14fde7ed61b7dc4799f667dd630e0798ab82d
 size 4915916176
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
- oid sha256:91d69130c212d01cd3d036336bfc584c3307c7f75402e95fc66b53cae79b0ccd
+ oid sha256:8a788bcf7589d73a13404a18ea27862b30c70bffb1514bb221b7aa5aa20773ae
 size 1168138808
train_results.json CHANGED
@@ -1,9 +1,9 @@
 {
- "epoch": 0.9984168865435357,
+ "epoch": 0.9984827495217362,
 "total_flos": 0.0,
- "train_loss": 0.466365703316622,
- "train_runtime": 19524.7969,
+ "train_loss": 0.4645486564767285,
+ "train_runtime": 24049.7915,
 "train_samples": 60634,
- "train_samples_per_second": 3.105,
- "train_steps_per_second": 0.024
+ "train_samples_per_second": 2.521,
+ "train_steps_per_second": 0.02
 }
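As a quick consistency check on train_results.json, samples per second times runtime should roughly reproduce the 60634 training samples in both runs; the longer runtime of the new run is consistent with the smaller per-device batch. A sketch with values from the diff:

```python
# train_samples ~ train_samples_per_second * train_runtime
print(round(3.105 * 19524.7969))   # ~60624 (old run), vs train_samples = 60634
print(round(2.521 * 24049.7915))   # ~60630 (new run), vs train_samples = 60634
```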
trainer_state.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9984168865435357,
5
  "eval_steps": 400,
6
  "global_step": 473,
7
  "is_hyper_param_search": false,
@@ -9,13 +9,13 @@
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0021108179419525065,
13
- "grad_norm": 3.841525938161017,
14
  "learning_rate": 1.0416666666666666e-08,
15
- "logits/chosen": -1.5679885149002075,
16
- "logits/rejected": -1.4838868379592896,
17
- "logps/chosen": -273.748046875,
18
- "logps/rejected": -278.32440185546875,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -24,1439 +24,1439 @@
24
  "step": 1
25
  },
26
  {
27
- "epoch": 0.010554089709762533,
28
- "grad_norm": 4.075044604292173,
29
  "learning_rate": 5.208333333333333e-08,
30
- "logits/chosen": -1.8661268949508667,
31
- "logits/rejected": -1.663633108139038,
32
- "logps/chosen": -259.7994384765625,
33
- "logps/rejected": -272.9507751464844,
34
- "loss": 0.6931,
35
- "rewards/accuracies": 0.4453125,
36
- "rewards/chosen": 0.0006091540562920272,
37
- "rewards/margins": 0.0006048179930076003,
38
- "rewards/rejected": 4.33622335549444e-06,
39
  "step": 5
40
  },
41
  {
42
- "epoch": 0.021108179419525065,
43
- "grad_norm": 3.8938427277220327,
44
  "learning_rate": 1.0416666666666667e-07,
45
- "logits/chosen": -1.9186642169952393,
46
- "logits/rejected": -1.7813522815704346,
47
- "logps/chosen": -260.3355407714844,
48
- "logps/rejected": -277.6410217285156,
49
  "loss": 0.6933,
50
- "rewards/accuracies": 0.512499988079071,
51
- "rewards/chosen": 0.0007230077171698213,
52
- "rewards/margins": -0.0004294753889553249,
53
- "rewards/rejected": 0.0011524828150868416,
54
  "step": 10
55
  },
56
  {
57
- "epoch": 0.0316622691292876,
58
- "grad_norm": 4.232192731720217,
59
  "learning_rate": 1.5624999999999999e-07,
60
- "logits/chosen": -1.9166736602783203,
61
- "logits/rejected": -1.6127517223358154,
62
- "logps/chosen": -262.7110900878906,
63
- "logps/rejected": -288.9376525878906,
64
- "loss": 0.6931,
65
- "rewards/accuracies": 0.5,
66
- "rewards/chosen": 0.0007503399974666536,
67
- "rewards/margins": 6.939703598618507e-05,
68
- "rewards/rejected": 0.0006809430196881294,
69
  "step": 15
70
  },
71
  {
72
- "epoch": 0.04221635883905013,
73
- "grad_norm": 4.119849835606016,
74
  "learning_rate": 2.0833333333333333e-07,
75
- "logits/chosen": -1.8074525594711304,
76
- "logits/rejected": -1.6753528118133545,
77
- "logps/chosen": -288.84808349609375,
78
- "logps/rejected": -297.88995361328125,
79
- "loss": 0.6932,
80
- "rewards/accuracies": 0.48124998807907104,
81
- "rewards/chosen": -0.0002116250980179757,
82
- "rewards/margins": -0.000452941982075572,
83
- "rewards/rejected": 0.00024131681129802018,
84
  "step": 20
85
  },
86
  {
87
- "epoch": 0.052770448548812667,
88
- "grad_norm": 4.422447549074996,
89
  "learning_rate": 2.604166666666667e-07,
90
- "logits/chosen": -1.8519093990325928,
91
- "logits/rejected": -1.6747506856918335,
92
- "logps/chosen": -276.16290283203125,
93
- "logps/rejected": -283.3067932128906,
94
- "loss": 0.6924,
95
- "rewards/accuracies": 0.59375,
96
- "rewards/chosen": -0.0022073048166930676,
97
- "rewards/margins": 0.001611467800103128,
98
- "rewards/rejected": -0.0038187727332115173,
99
  "step": 25
100
  },
101
  {
102
- "epoch": 0.0633245382585752,
103
- "grad_norm": 4.140769853407654,
104
  "learning_rate": 3.1249999999999997e-07,
105
- "logits/chosen": -1.8203039169311523,
106
- "logits/rejected": -1.6214573383331299,
107
- "logps/chosen": -254.4104461669922,
108
- "logps/rejected": -275.9024353027344,
109
- "loss": 0.6916,
110
- "rewards/accuracies": 0.6312500238418579,
111
- "rewards/chosen": -0.0038712085224688053,
112
- "rewards/margins": 0.004021945409476757,
113
- "rewards/rejected": -0.00789315439760685,
114
  "step": 30
115
  },
116
  {
117
- "epoch": 0.07387862796833773,
118
- "grad_norm": 4.0748094829519985,
119
  "learning_rate": 3.645833333333333e-07,
120
- "logits/chosen": -1.7195453643798828,
121
- "logits/rejected": -1.5980784893035889,
122
- "logps/chosen": -277.2474060058594,
123
- "logps/rejected": -279.6336364746094,
124
- "loss": 0.6903,
125
- "rewards/accuracies": 0.637499988079071,
126
- "rewards/chosen": -0.011106612160801888,
127
- "rewards/margins": 0.005168012343347073,
128
- "rewards/rejected": -0.016274623572826385,
129
  "step": 35
130
  },
131
  {
132
- "epoch": 0.08443271767810026,
133
- "grad_norm": 4.037161343642648,
134
  "learning_rate": 4.1666666666666667e-07,
135
- "logits/chosen": -1.8530025482177734,
136
- "logits/rejected": -1.6534423828125,
137
- "logps/chosen": -250.5609893798828,
138
- "logps/rejected": -266.48681640625,
139
- "loss": 0.6878,
140
- "rewards/accuracies": 0.762499988079071,
141
- "rewards/chosen": -0.013290290720760822,
142
- "rewards/margins": 0.01362483762204647,
143
- "rewards/rejected": -0.026915129274129868,
144
  "step": 40
145
  },
146
  {
147
- "epoch": 0.09498680738786279,
148
- "grad_norm": 4.20201566482073,
149
  "learning_rate": 4.6874999999999996e-07,
150
- "logits/chosen": -1.8621238470077515,
151
- "logits/rejected": -1.7357890605926514,
152
- "logps/chosen": -259.96875,
153
- "logps/rejected": -273.11651611328125,
154
- "loss": 0.6849,
155
- "rewards/accuracies": 0.706250011920929,
156
- "rewards/chosen": -0.02979857288300991,
157
- "rewards/margins": 0.019030530005693436,
158
- "rewards/rejected": -0.0488291010260582,
159
  "step": 45
160
  },
161
  {
162
- "epoch": 0.10554089709762533,
163
- "grad_norm": 4.392167523026418,
164
  "learning_rate": 4.999726797933858e-07,
165
- "logits/chosen": -1.9742714166641235,
166
- "logits/rejected": -1.761182188987732,
167
- "logps/chosen": -272.1903381347656,
168
- "logps/rejected": -285.57098388671875,
169
- "loss": 0.6753,
170
- "rewards/accuracies": 0.7250000238418579,
171
- "rewards/chosen": -0.048685222864151,
172
- "rewards/margins": 0.03682791069149971,
173
- "rewards/rejected": -0.08551312983036041,
174
  "step": 50
175
  },
176
  {
177
- "epoch": 0.11609498680738786,
178
- "grad_norm": 6.0936366972280105,
179
  "learning_rate": 4.99665396039775e-07,
180
- "logits/chosen": -1.9219143390655518,
181
- "logits/rejected": -1.8215105533599854,
182
- "logps/chosen": -269.31439208984375,
183
- "logps/rejected": -276.80401611328125,
184
- "loss": 0.659,
185
- "rewards/accuracies": 0.7250000238418579,
186
- "rewards/chosen": -0.10569655895233154,
187
- "rewards/margins": 0.0726684108376503,
188
- "rewards/rejected": -0.17836496233940125,
189
  "step": 55
190
  },
191
  {
192
- "epoch": 0.1266490765171504,
193
- "grad_norm": 7.231191310156758,
194
  "learning_rate": 4.99017099386437e-07,
195
- "logits/chosen": -2.0729923248291016,
196
- "logits/rejected": -1.9367930889129639,
197
- "logps/chosen": -298.20849609375,
198
- "logps/rejected": -349.7650146484375,
199
- "loss": 0.6298,
200
  "rewards/accuracies": 0.75,
201
- "rewards/chosen": -0.31456637382507324,
202
- "rewards/margins": 0.3051101565361023,
203
- "rewards/rejected": -0.6196764707565308,
204
  "step": 60
205
  },
206
  {
207
- "epoch": 0.13720316622691292,
208
- "grad_norm": 67.13648614495237,
209
  "learning_rate": 4.980286753286194e-07,
210
- "logits/chosen": -2.2857210636138916,
211
- "logits/rejected": -2.1148781776428223,
212
- "logps/chosen": -369.61749267578125,
213
- "logps/rejected": -430.94732666015625,
214
- "loss": 0.6277,
215
- "rewards/accuracies": 0.668749988079071,
216
- "rewards/chosen": -1.1958519220352173,
217
- "rewards/margins": 0.49135223031044006,
218
- "rewards/rejected": -1.6872040033340454,
219
  "step": 65
220
  },
221
  {
222
- "epoch": 0.14775725593667546,
223
- "grad_norm": 9.715273109578154,
224
  "learning_rate": 4.967014739346915e-07,
225
- "logits/chosen": -2.3191657066345215,
226
- "logits/rejected": -2.0927023887634277,
227
- "logps/chosen": -352.59075927734375,
228
- "logps/rejected": -438.1763610839844,
229
- "loss": 0.5858,
230
- "rewards/accuracies": 0.737500011920929,
231
- "rewards/chosen": -0.8432434797286987,
232
- "rewards/margins": 0.6638648509979248,
233
- "rewards/rejected": -1.5071083307266235,
234
  "step": 70
235
  },
236
  {
237
- "epoch": 0.158311345646438,
238
- "grad_norm": 9.799570258257988,
239
  "learning_rate": 4.950373080021136e-07,
240
- "logits/chosen": -2.159883499145508,
241
- "logits/rejected": -2.089489459991455,
242
- "logps/chosen": -327.1300964355469,
243
- "logps/rejected": -372.9543762207031,
244
- "loss": 0.5733,
245
- "rewards/accuracies": 0.706250011920929,
246
- "rewards/chosen": -0.625116229057312,
247
- "rewards/margins": 0.409213125705719,
248
- "rewards/rejected": -1.0343292951583862,
249
  "step": 75
250
  },
251
  {
252
- "epoch": 0.16886543535620052,
253
- "grad_norm": 21.779152085184286,
254
  "learning_rate": 4.930384505813737e-07,
255
- "logits/chosen": -2.304996967315674,
256
- "logits/rejected": -2.1810271739959717,
257
- "logps/chosen": -355.3009033203125,
258
- "logps/rejected": -471.39892578125,
259
- "loss": 0.5459,
260
- "rewards/accuracies": 0.7124999761581421,
261
- "rewards/chosen": -1.0548468828201294,
262
- "rewards/margins": 0.9830275774002075,
263
- "rewards/rejected": -2.037874221801758,
264
  "step": 80
265
  },
266
  {
267
- "epoch": 0.17941952506596306,
268
- "grad_norm": 14.56820002316678,
269
  "learning_rate": 4.907076318712738e-07,
270
- "logits/chosen": -2.2340409755706787,
271
- "logits/rejected": -2.080930233001709,
272
- "logps/chosen": -413.451416015625,
273
- "logps/rejected": -522.9191284179688,
274
- "loss": 0.5408,
275
- "rewards/accuracies": 0.731249988079071,
276
- "rewards/chosen": -1.4273126125335693,
277
- "rewards/margins": 0.9603279829025269,
278
- "rewards/rejected": -2.3876404762268066,
279
  "step": 85
280
  },
281
  {
282
- "epoch": 0.18997361477572558,
283
- "grad_norm": 15.919341883386638,
284
  "learning_rate": 4.88048035489807e-07,
285
- "logits/chosen": -2.174340009689331,
286
- "logits/rejected": -2.168853998184204,
287
- "logps/chosen": -394.6278076171875,
288
- "logps/rejected": -461.028564453125,
289
- "loss": 0.5463,
290
- "rewards/accuracies": 0.78125,
291
- "rewards/chosen": -1.2512483596801758,
292
- "rewards/margins": 0.6085057854652405,
293
- "rewards/rejected": -1.859754204750061,
294
  "step": 90
295
  },
296
  {
297
- "epoch": 0.20052770448548812,
298
- "grad_norm": 23.30417545081651,
299
  "learning_rate": 4.85063294125718e-07,
300
- "logits/chosen": -2.1903815269470215,
301
- "logits/rejected": -2.19649076461792,
302
- "logps/chosen": -459.72283935546875,
303
- "logps/rejected": -530.1971435546875,
304
- "loss": 0.5459,
305
- "rewards/accuracies": 0.6812499761581421,
306
- "rewards/chosen": -1.8961833715438843,
307
- "rewards/margins": 0.6760674715042114,
308
- "rewards/rejected": -2.5722508430480957,
309
  "step": 95
310
  },
311
  {
312
- "epoch": 0.21108179419525067,
313
- "grad_norm": 11.60980371327302,
314
  "learning_rate": 4.817574845766874e-07,
315
- "logits/chosen": -2.358705997467041,
316
- "logits/rejected": -2.307624340057373,
317
- "logps/chosen": -447.1853942871094,
318
- "logps/rejected": -532.86279296875,
319
- "loss": 0.5137,
320
  "rewards/accuracies": 0.7749999761581421,
321
- "rewards/chosen": -1.9070106744766235,
322
- "rewards/margins": 0.7790099382400513,
323
- "rewards/rejected": -2.686020612716675,
324
  "step": 100
325
  },
326
  {
327
- "epoch": 0.22163588390501318,
328
- "grad_norm": 14.306450146724028,
329
  "learning_rate": 4.781351221809166e-07,
330
- "logits/chosen": -2.2865371704101562,
331
- "logits/rejected": -2.176837921142578,
332
- "logps/chosen": -432.4977111816406,
333
- "logps/rejected": -542.9056396484375,
334
- "loss": 0.5261,
335
- "rewards/accuracies": 0.768750011920929,
336
- "rewards/chosen": -1.7739086151123047,
337
- "rewards/margins": 0.9299384951591492,
338
- "rewards/rejected": -2.7038469314575195,
339
  "step": 105
340
  },
341
  {
342
- "epoch": 0.23218997361477572,
343
- "grad_norm": 10.269899188048251,
344
  "learning_rate": 4.742011546497182e-07,
345
- "logits/chosen": -2.2152955532073975,
346
- "logits/rejected": -2.1580278873443604,
347
- "logps/chosen": -439.315185546875,
348
- "logps/rejected": -549.2676391601562,
349
- "loss": 0.494,
350
- "rewards/accuracies": 0.8125,
351
- "rewards/chosen": -1.7390915155410767,
352
- "rewards/margins": 0.9396551847457886,
353
- "rewards/rejected": -2.6787467002868652,
354
  "step": 110
355
  },
356
  {
357
- "epoch": 0.24274406332453827,
358
- "grad_norm": 16.644175161757378,
359
  "learning_rate": 4.6996095530953875e-07,
360
- "logits/chosen": -2.3286213874816895,
361
- "logits/rejected": -2.2058520317077637,
362
- "logps/chosen": -506.5923767089844,
363
- "logps/rejected": -658.1654052734375,
364
- "loss": 0.4994,
365
- "rewards/accuracies": 0.8374999761581421,
366
- "rewards/chosen": -2.3760502338409424,
367
- "rewards/margins": 1.3993351459503174,
368
- "rewards/rejected": -3.7753853797912598,
369
  "step": 115
370
  },
371
  {
372
- "epoch": 0.2532981530343008,
373
- "grad_norm": 18.458409874645245,
374
  "learning_rate": 4.654203157626399e-07,
375
- "logits/chosen": -2.363788366317749,
376
- "logits/rejected": -2.2831900119781494,
377
- "logps/chosen": -476.95831298828125,
378
- "logps/rejected": -650.87841796875,
379
- "loss": 0.4745,
380
- "rewards/accuracies": 0.8062499761581421,
381
- "rewards/chosen": -2.333024501800537,
382
- "rewards/margins": 1.5511460304260254,
383
- "rewards/rejected": -3.8841705322265625,
384
  "step": 120
385
  },
386
  {
387
- "epoch": 0.2638522427440633,
388
- "grad_norm": 24.30561683820342,
389
  "learning_rate": 4.605854379764673e-07,
390
- "logits/chosen": -2.2180769443511963,
391
- "logits/rejected": -2.1058664321899414,
392
- "logps/chosen": -458.69317626953125,
393
- "logps/rejected": -573.3502807617188,
394
- "loss": 0.4683,
395
- "rewards/accuracies": 0.7749999761581421,
396
- "rewards/chosen": -2.0290579795837402,
397
- "rewards/margins": 1.0193411111831665,
398
- "rewards/rejected": -3.048398971557617,
399
  "step": 125
400
  },
401
  {
402
- "epoch": 0.27440633245382584,
403
- "grad_norm": 25.019298570271868,
404
  "learning_rate": 4.5546292581250857e-07,
405
- "logits/chosen": -2.2698774337768555,
406
- "logits/rejected": -2.150057554244995,
407
- "logps/chosen": -563.2131958007812,
408
- "logps/rejected": -722.5281372070312,
409
- "loss": 0.4752,
410
- "rewards/accuracies": 0.8125,
411
- "rewards/chosen": -2.93915057182312,
412
- "rewards/margins": 1.441446304321289,
413
- "rewards/rejected": -4.380597114562988,
414
  "step": 130
415
  },
416
  {
417
- "epoch": 0.2849604221635884,
418
- "grad_norm": 10.994821669390042,
419
  "learning_rate": 4.5005977600621275e-07,
420
- "logits/chosen": -2.243281841278076,
421
- "logits/rejected": -2.2170357704162598,
422
- "logps/chosen": -536.69970703125,
423
- "logps/rejected": -645.5635986328125,
424
- "loss": 0.4739,
425
- "rewards/accuracies": 0.7562500238418579,
426
- "rewards/chosen": -2.6128830909729004,
427
- "rewards/margins": 1.0102598667144775,
428
- "rewards/rejected": -3.623142957687378,
429
  "step": 135
430
  },
431
  {
432
- "epoch": 0.2955145118733509,
433
- "grad_norm": 16.90701177792478,
434
  "learning_rate": 4.443833686102919e-07,
435
- "logits/chosen": -2.1392781734466553,
436
- "logits/rejected": -2.0879039764404297,
437
- "logps/chosen": -433.86590576171875,
438
- "logps/rejected": -533.6943359375,
439
- "loss": 0.4645,
440
- "rewards/accuracies": 0.8062499761581421,
441
- "rewards/chosen": -1.7928444147109985,
442
- "rewards/margins": 0.9198592901229858,
443
- "rewards/rejected": -2.712703227996826,
444
  "step": 140
445
  },
446
  {
447
- "epoch": 0.30606860158311344,
448
- "grad_norm": 23.854657702935985,
449
  "learning_rate": 4.384414569144561e-07,
450
- "logits/chosen": -2.3052217960357666,
451
- "logits/rejected": -2.207017421722412,
452
- "logps/chosen": -529.6088256835938,
453
- "logps/rejected": -723.9100341796875,
454
- "loss": 0.4979,
455
- "rewards/accuracies": 0.8187500238418579,
456
- "rewards/chosen": -2.7643752098083496,
457
- "rewards/margins": 1.82810378074646,
458
- "rewards/rejected": -4.592479228973389,
459
  "step": 145
460
  },
461
  {
462
- "epoch": 0.316622691292876,
463
- "grad_norm": 19.353784387057143,
464
  "learning_rate": 4.3224215685535287e-07,
465
- "logits/chosen": -2.0858356952667236,
466
- "logits/rejected": -1.950209617614746,
467
- "logps/chosen": -505.2822265625,
468
- "logps/rejected": -661.0929565429688,
469
- "loss": 0.4656,
470
- "rewards/accuracies": 0.793749988079071,
471
- "rewards/chosen": -2.391045093536377,
472
- "rewards/margins": 1.4024264812469482,
473
- "rewards/rejected": -3.793471097946167,
474
  "step": 150
475
  },
476
  {
477
- "epoch": 0.32717678100263853,
478
- "grad_norm": 17.72909970129764,
479
  "learning_rate": 4.2579393593117364e-07,
480
- "logits/chosen": -2.0300238132476807,
481
- "logits/rejected": -1.9049923419952393,
482
- "logps/chosen": -496.39324951171875,
483
- "logps/rejected": -680.350341796875,
484
- "loss": 0.4412,
485
- "rewards/accuracies": 0.800000011920929,
486
- "rewards/chosen": -2.427093982696533,
487
- "rewards/margins": 1.5917612314224243,
488
- "rewards/rejected": -4.018855094909668,
489
  "step": 155
490
  },
491
  {
492
- "epoch": 0.33773087071240104,
493
- "grad_norm": 17.1778742252489,
494
  "learning_rate": 4.191056016360699e-07,
495
- "logits/chosen": -2.215439558029175,
496
- "logits/rejected": -2.1087276935577393,
497
- "logps/chosen": -615.9310302734375,
498
- "logps/rejected": -818.6203002929688,
499
- "loss": 0.4622,
500
- "rewards/accuracies": 0.762499988079071,
501
- "rewards/chosen": -3.431640148162842,
502
- "rewards/margins": 1.9439836740493774,
503
- "rewards/rejected": -5.37562370300293,
504
  "step": 160
505
  },
506
  {
507
- "epoch": 0.3482849604221636,
508
- "grad_norm": 17.97809867221494,
509
  "learning_rate": 4.121862894301754e-07,
510
- "logits/chosen": -2.0415732860565186,
511
- "logits/rejected": -1.94220769405365,
512
- "logps/chosen": -498.63116455078125,
513
- "logps/rejected": -657.0416259765625,
514
- "loss": 0.479,
515
- "rewards/accuracies": 0.75,
516
- "rewards/chosen": -2.2521657943725586,
517
- "rewards/margins": 1.3823236227035522,
518
- "rewards/rejected": -3.6344895362854004,
519
  "step": 165
520
  },
521
  {
522
- "epoch": 0.35883905013192613,
523
- "grad_norm": 30.072934787327185,
524
  "learning_rate": 4.050454502616667e-07,
525
- "logits/chosen": -2.118239164352417,
526
- "logits/rejected": -2.090146541595459,
527
- "logps/chosen": -526.2330322265625,
528
- "logps/rejected": -668.2966918945312,
529
- "loss": 0.4648,
530
- "rewards/accuracies": 0.762499988079071,
531
- "rewards/chosen": -2.6902260780334473,
532
- "rewards/margins": 1.3285554647445679,
533
- "rewards/rejected": -4.0187811851501465,
534
  "step": 170
535
  },
536
  {
537
- "epoch": 0.36939313984168864,
538
- "grad_norm": 29.035498895998003,
539
  "learning_rate": 3.976928376579047e-07,
540
- "logits/chosen": -2.3821628093719482,
541
- "logits/rejected": -2.2632079124450684,
542
- "logps/chosen": -557.0284423828125,
543
- "logps/rejected": -771.6123657226562,
544
- "loss": 0.4449,
545
- "rewards/accuracies": 0.8374999761581421,
546
- "rewards/chosen": -2.932424545288086,
547
- "rewards/margins": 1.8806273937225342,
548
- "rewards/rejected": -4.813051223754883,
549
  "step": 175
550
  },
551
  {
552
- "epoch": 0.37994722955145116,
553
- "grad_norm": 15.777007984898162,
554
  "learning_rate": 3.9013849440328945e-07,
555
- "logits/chosen": -2.286719560623169,
556
- "logits/rejected": -2.162851095199585,
557
- "logps/chosen": -564.4080200195312,
558
- "logps/rejected": -720.1937255859375,
559
- "loss": 0.467,
560
- "rewards/accuracies": 0.8187500238418579,
561
- "rewards/chosen": -2.908557415008545,
562
- "rewards/margins": 1.4215553998947144,
563
- "rewards/rejected": -4.330113410949707,
564
  "step": 180
565
  },
566
  {
567
- "epoch": 0.39050131926121373,
568
- "grad_norm": 12.25325652821894,
569
  "learning_rate": 3.8239273882202473e-07,
570
- "logits/chosen": -2.195413589477539,
571
- "logits/rejected": -2.2209365367889404,
572
- "logps/chosen": -495.92938232421875,
573
- "logps/rejected": -645.3634643554688,
574
- "loss": 0.469,
575
- "rewards/accuracies": 0.793749988079071,
576
- "rewards/chosen": -2.377912759780884,
577
- "rewards/margins": 1.3328666687011719,
578
- "rewards/rejected": -3.7107791900634766,
579
  "step": 185
580
  },
581
  {
582
- "epoch": 0.40105540897097625,
583
- "grad_norm": 13.405956669044865,
584
  "learning_rate": 3.7446615068452804e-07,
585
- "logits/chosen": -2.128485918045044,
586
- "logits/rejected": -2.0320448875427246,
587
- "logps/chosen": -500.07598876953125,
588
- "logps/rejected": -665.8009643554688,
589
- "loss": 0.4456,
590
  "rewards/accuracies": 0.831250011920929,
591
- "rewards/chosen": -2.381704092025757,
592
- "rewards/margins": 1.4976516962051392,
593
- "rewards/rejected": -3.8793559074401855,
594
  "step": 190
595
  },
596
  {
597
- "epoch": 0.41160949868073876,
598
- "grad_norm": 16.531263865887837,
599
  "learning_rate": 3.6636955675673743e-07,
600
- "logits/chosen": -2.1537322998046875,
601
- "logits/rejected": -2.151557207107544,
602
- "logps/chosen": -563.8980102539062,
603
- "logps/rejected": -719.9155883789062,
604
- "loss": 0.4301,
605
- "rewards/accuracies": 0.824999988079071,
606
- "rewards/chosen": -2.8693251609802246,
607
- "rewards/margins": 1.4603594541549683,
608
- "rewards/rejected": -4.329684734344482,
609
  "step": 195
610
  },
611
  {
612
- "epoch": 0.42216358839050133,
613
- "grad_norm": 25.62641100404869,
614
  "learning_rate": 3.5811401601205093e-07,
615
- "logits/chosen": -2.1722164154052734,
616
- "logits/rejected": -2.2210490703582764,
617
- "logps/chosen": -547.766845703125,
618
- "logps/rejected": -697.3842163085938,
619
- "loss": 0.4585,
620
- "rewards/accuracies": 0.7749999761581421,
621
- "rewards/chosen": -2.807692766189575,
622
- "rewards/margins": 1.545601725578308,
623
- "rewards/rejected": -4.353294372558594,
624
  "step": 200
625
  },
626
  {
627
- "epoch": 0.43271767810026385,
628
- "grad_norm": 15.253711310557463,
629
  "learning_rate": 3.497108045260995e-07,
630
- "logits/chosen": -2.0688979625701904,
631
- "logits/rejected": -2.104271173477173,
632
- "logps/chosen": -529.1517333984375,
633
- "logps/rejected": -676.9817504882812,
634
- "loss": 0.4423,
635
- "rewards/accuracies": 0.8187500238418579,
636
- "rewards/chosen": -2.4731733798980713,
637
- "rewards/margins": 1.4461132287979126,
638
- "rewards/rejected": -3.9192867279052734,
639
  "step": 205
640
  },
641
  {
642
- "epoch": 0.44327176781002636,
643
- "grad_norm": 24.083715768462596,
644
  "learning_rate": 3.411714000749838e-07,
645
- "logits/chosen": -2.2706260681152344,
646
- "logits/rejected": -2.135749340057373,
647
- "logps/chosen": -541.0875854492188,
648
- "logps/rejected": -750.7264404296875,
649
- "loss": 0.4354,
650
- "rewards/accuracies": 0.862500011920929,
651
- "rewards/chosen": -2.909759283065796,
652
- "rewards/margins": 1.8729289770126343,
653
- "rewards/rejected": -4.782688140869141,
654
  "step": 210
655
  },
656
  {
657
- "epoch": 0.45382585751978893,
658
- "grad_norm": 35.56607178592358,
659
  "learning_rate": 3.3250746645801287e-07,
660
- "logits/chosen": -2.263277769088745,
661
- "logits/rejected": -2.205223560333252,
662
- "logps/chosen": -608.2554931640625,
663
- "logps/rejected": -830.9841918945312,
664
- "loss": 0.4321,
665
- "rewards/accuracies": 0.7875000238418579,
666
- "rewards/chosen": -3.611112117767334,
667
- "rewards/margins": 2.0616540908813477,
668
- "rewards/rejected": -5.672766208648682,
669
  "step": 215
670
  },
671
  {
672
- "epoch": 0.46437994722955145,
673
- "grad_norm": 15.718670222248921,
674
  "learning_rate": 3.237308375663571e-07,
675
- "logits/chosen": -2.230881452560425,
676
- "logits/rejected": -2.121683359146118,
677
- "logps/chosen": -576.1820678710938,
678
- "logps/rejected": -769.9691772460938,
679
- "loss": 0.3944,
680
- "rewards/accuracies": 0.8687499761581421,
681
- "rewards/chosen": -3.302262783050537,
682
- "rewards/margins": 1.794217824935913,
683
- "rewards/rejected": -5.096480369567871,
684
  "step": 220
685
  },
686
  {
687
- "epoch": 0.47493403693931396,
688
- "grad_norm": 19.204923979579966,
689
  "learning_rate": 3.148535012193767e-07,
690
- "logits/chosen": -2.1568782329559326,
691
- "logits/rejected": -2.092639684677124,
692
- "logps/chosen": -615.4882202148438,
693
- "logps/rejected": -833.5153198242188,
694
- "loss": 0.3871,
695
- "rewards/accuracies": 0.856249988079071,
696
- "rewards/chosen": -3.338430881500244,
697
- "rewards/margins": 2.001122236251831,
698
- "rewards/rejected": -5.339552879333496,
699
  "step": 225
700
  },
701
  {
702
- "epoch": 0.48548812664907653,
703
- "grad_norm": 23.052920344271605,
704
  "learning_rate": 3.0588758279070183e-07,
705
- "logits/chosen": -2.2185590267181396,
706
- "logits/rejected": -2.13350772857666,
707
- "logps/chosen": -622.9224853515625,
708
- "logps/rejected": -836.8287353515625,
709
- "loss": 0.4125,
710
  "rewards/accuracies": 0.824999988079071,
711
- "rewards/chosen": -3.6510891914367676,
712
- "rewards/margins": 1.8919486999511719,
713
- "rewards/rejected": -5.5430378913879395,
714
  "step": 230
715
  },
716
  {
717
- "epoch": 0.49604221635883905,
718
- "grad_norm": 16.46282996942275,
719
  "learning_rate": 2.968453286464312e-07,
720
- "logits/chosen": -2.097414493560791,
721
- "logits/rejected": -2.146594524383545,
722
- "logps/chosen": -590.5551147460938,
723
- "logps/rejected": -758.9312744140625,
724
- "loss": 0.4164,
725
- "rewards/accuracies": 0.8374999761581421,
726
- "rewards/chosen": -3.0970864295959473,
727
- "rewards/margins": 1.6357967853546143,
728
- "rewards/rejected": -4.732882976531982,
729
  "step": 235
730
  },
731
  {
732
- "epoch": 0.5065963060686016,
733
- "grad_norm": 26.112494003766066,
734
  "learning_rate": 2.8773908941806877e-07,
735
- "logits/chosen": -2.0698182582855225,
736
- "logits/rejected": -2.076683521270752,
737
- "logps/chosen": -617.1507568359375,
738
- "logps/rejected": -853.2135620117188,
739
- "loss": 0.3982,
740
- "rewards/accuracies": 0.8187500238418579,
741
- "rewards/chosen": -3.5680668354034424,
742
- "rewards/margins": 2.141855001449585,
743
- "rewards/rejected": -5.709921836853027,
744
  "step": 240
745
  },
746
  {
747
- "epoch": 0.5171503957783641,
748
- "grad_norm": 20.932946542012903,
749
  "learning_rate": 2.785813031330473e-07,
750
- "logits/chosen": -2.1454832553863525,
751
- "logits/rejected": -2.16323184967041,
752
- "logps/chosen": -661.7200317382812,
753
- "logps/rejected": -866.1280517578125,
754
- "loss": 0.4092,
755
- "rewards/accuracies": 0.75,
756
- "rewards/chosen": -4.027346134185791,
757
- "rewards/margins": 1.9131158590316772,
758
- "rewards/rejected": -5.940462112426758,
759
  "step": 245
760
  },
761
  {
762
- "epoch": 0.5277044854881267,
763
- "grad_norm": 15.896790069729533,
764
  "learning_rate": 2.693844782258779e-07,
765
- "logits/chosen": -2.030596971511841,
766
- "logits/rejected": -1.9922313690185547,
767
- "logps/chosen": -571.3850708007812,
768
- "logps/rejected": -776.018310546875,
769
- "loss": 0.3852,
770
- "rewards/accuracies": 0.893750011920929,
771
- "rewards/chosen": -3.086763620376587,
772
- "rewards/margins": 1.9301198720932007,
773
- "rewards/rejected": -5.016883850097656,
774
  "step": 250
775
  },
776
  {
777
- "epoch": 0.5382585751978892,
778
- "grad_norm": 40.59897974633979,
779
  "learning_rate": 2.601611764531342e-07,
780
- "logits/chosen": -2.153049945831299,
781
- "logits/rejected": -2.1268014907836914,
782
- "logps/chosen": -659.8489990234375,
783
- "logps/rejected": -876.6301879882812,
784
- "loss": 0.4062,
785
- "rewards/accuracies": 0.8500000238418579,
786
- "rewards/chosen": -4.000391483306885,
787
- "rewards/margins": 2.1393191814422607,
788
- "rewards/rejected": -6.139710426330566,
789
  "step": 255
790
  },
791
  {
792
- "epoch": 0.5488126649076517,
793
- "grad_norm": 16.71817267029077,
794
  "learning_rate": 2.5092399573560323e-07,
795
- "logits/chosen": -2.236642599105835,
796
- "logits/rejected": -2.24824857711792,
797
- "logps/chosen": -675.7197265625,
798
- "logps/rejected": -906.6882934570312,
799
- "loss": 0.4331,
800
- "rewards/accuracies": 0.8125,
801
- "rewards/chosen": -4.17581844329834,
802
- "rewards/margins": 2.249803304672241,
803
- "rewards/rejected": -6.42562198638916,
804
  "step": 260
805
  },
806
  {
807
- "epoch": 0.5593667546174143,
808
- "grad_norm": 21.83948507996357,
809
  "learning_rate": 2.4168555295104124e-07,
810
- "logits/chosen": -2.185378313064575,
811
- "logits/rejected": -2.1056790351867676,
812
- "logps/chosen": -593.40283203125,
813
- "logps/rejected": -796.1773071289062,
814
- "loss": 0.4083,
815
- "rewards/accuracies": 0.824999988079071,
816
- "rewards/chosen": -3.3149795532226562,
817
- "rewards/margins": 1.8634592294692993,
818
- "rewards/rejected": -5.178439140319824,
819
  "step": 265
820
  },
821
  {
822
- "epoch": 0.5699208443271768,
823
- "grad_norm": 20.351730101984266,
824
  "learning_rate": 2.3245846670103626e-07,
825
- "logits/chosen": -2.268347978591919,
826
- "logits/rejected": -2.2143301963806152,
827
- "logps/chosen": -588.11474609375,
828
- "logps/rejected": -783.8377075195312,
829
- "loss": 0.3935,
830
- "rewards/accuracies": 0.824999988079071,
831
- "rewards/chosen": -3.3238461017608643,
832
- "rewards/margins": 1.8138678073883057,
833
- "rewards/rejected": -5.13771390914917,
834
  "step": 270
835
  },
836
  {
837
- "epoch": 0.5804749340369393,
838
- "grad_norm": 18.56747674948443,
839
  "learning_rate": 2.232553400755159e-07,
840
- "logits/chosen": -2.4159321784973145,
841
- "logits/rejected": -2.3257503509521484,
842
- "logps/chosen": -631.1921997070312,
843
- "logps/rejected": -876.8099365234375,
844
- "loss": 0.3663,
845
- "rewards/accuracies": 0.7875000238418579,
846
- "rewards/chosen": -3.7254486083984375,
847
- "rewards/margins": 2.293728828430176,
848
- "rewards/rejected": -6.0191779136657715,
849
  "step": 275
850
  },
851
  {
852
- "epoch": 0.5910290237467019,
853
- "grad_norm": 21.05078294350066,
854
  "learning_rate": 2.1408874343844294e-07,
855
- "logits/chosen": -2.3609871864318848,
856
- "logits/rejected": -2.229645013809204,
857
- "logps/chosen": -681.2824096679688,
858
- "logps/rejected": -997.8416748046875,
859
- "loss": 0.3917,
860
- "rewards/accuracies": 0.8125,
861
- "rewards/chosen": -4.151437282562256,
862
- "rewards/margins": 2.831943988800049,
863
- "rewards/rejected": -6.9833807945251465,
864
  "step": 280
865
  },
866
  {
867
- "epoch": 0.6015831134564644,
868
- "grad_norm": 17.819286464723362,
869
  "learning_rate": 2.049711972582101e-07,
870
- "logits/chosen": -2.2669837474823,
871
- "logits/rejected": -2.1804003715515137,
872
- "logps/chosen": -674.4667358398438,
873
- "logps/rejected": -925.66650390625,
874
- "loss": 0.3574,
875
- "rewards/accuracies": 0.831250011920929,
876
- "rewards/chosen": -4.095311641693115,
877
- "rewards/margins": 2.3263769149780273,
878
- "rewards/rejected": -6.421689033508301,
879
  "step": 285
880
  },
881
  {
882
- "epoch": 0.6121372031662269,
883
- "grad_norm": 16.393917654235082,
884
  "learning_rate": 1.9591515500618588e-07,
885
- "logits/chosen": -2.3980906009674072,
886
- "logits/rejected": -2.307847261428833,
887
- "logps/chosen": -668.3873901367188,
888
- "logps/rejected": -880.75146484375,
889
- "loss": 0.4484,
890
- "rewards/accuracies": 0.824999988079071,
891
- "rewards/chosen": -4.07429313659668,
892
- "rewards/margins": 1.9223320484161377,
893
- "rewards/rejected": -5.9966254234313965,
894
  "step": 290
895
  },
896
  {
897
- "epoch": 0.6226912928759895,
898
- "grad_norm": 17.607056207364927,
899
  "learning_rate": 1.8693298614677112e-07,
900
- "logits/chosen": -2.1555488109588623,
901
- "logits/rejected": -2.051828145980835,
902
- "logps/chosen": -596.3387451171875,
903
- "logps/rejected": -825.14892578125,
904
- "loss": 0.3679,
905
  "rewards/accuracies": 0.875,
906
- "rewards/chosen": -3.2922072410583496,
907
- "rewards/margins": 2.1459250450134277,
908
- "rewards/rejected": -5.438132286071777,
909
  "step": 295
910
  },
911
  {
912
- "epoch": 0.633245382585752,
913
- "grad_norm": 18.598122517039727,
914
  "learning_rate": 1.7803695924219814e-07,
915
- "logits/chosen": -2.2622170448303223,
916
- "logits/rejected": -2.1897120475769043,
917
- "logps/chosen": -639.8846435546875,
918
- "logps/rejected": -850.0399169921875,
919
- "loss": 0.4031,
920
- "rewards/accuracies": 0.84375,
921
- "rewards/chosen": -3.6260199546813965,
922
- "rewards/margins": 2.0194387435913086,
923
- "rewards/rejected": -5.645459175109863,
924
  "step": 300
925
  },
926
  {
927
- "epoch": 0.6437994722955145,
928
- "grad_norm": 16.59129232266985,
929
  "learning_rate": 1.6923922519515067e-07,
930
- "logits/chosen": -2.2015440464019775,
931
- "logits/rejected": -2.129885196685791,
932
- "logps/chosen": -558.0819091796875,
933
- "logps/rejected": -752.4927368164062,
934
- "loss": 0.4095,
935
- "rewards/accuracies": 0.78125,
936
- "rewards/chosen": -3.0600318908691406,
937
- "rewards/margins": 1.7260305881500244,
938
- "rewards/rejected": -4.786062240600586,
939
  "step": 305
940
  },
941
  {
942
- "epoch": 0.6543535620052771,
943
- "grad_norm": 18.44006124052621,
944
  "learning_rate": 1.605518006520924e-07,
945
- "logits/chosen": -2.301358461380005,
946
- "logits/rejected": -2.184253215789795,
947
- "logps/chosen": -583.1818237304688,
948
- "logps/rejected": -801.277099609375,
949
- "loss": 0.3928,
950
- "rewards/accuracies": 0.8187500238418579,
951
- "rewards/chosen": -3.176647663116455,
952
- "rewards/margins": 2.0725767612457275,
953
- "rewards/rejected": -5.249224662780762,
954
  "step": 310
955
  },
956
  {
957
- "epoch": 0.6649076517150396,
958
- "grad_norm": 43.00212859415373,
959
  "learning_rate": 1.519865515899731e-07,
960
- "logits/chosen": -2.302088975906372,
961
- "logits/rejected": -2.1100873947143555,
962
- "logps/chosen": -601.4708251953125,
963
- "logps/rejected": -821.9664916992188,
964
- "loss": 0.3886,
965
- "rewards/accuracies": 0.862500011920929,
966
- "rewards/chosen": -3.5313808917999268,
967
- "rewards/margins": 1.985640287399292,
968
- "rewards/rejected": -5.517021179199219,
969
  "step": 315
970
  },
971
  {
972
- "epoch": 0.6754617414248021,
973
- "grad_norm": 19.308206012998415,
974
  "learning_rate": 1.4355517710873182e-07,
975
- "logits/chosen": -2.306898593902588,
976
- "logits/rejected": -2.2703452110290527,
977
- "logps/chosen": -637.2567138671875,
978
- "logps/rejected": -900.8046875,
979
- "loss": 0.3968,
980
- "rewards/accuracies": 0.8687499761581421,
981
- "rewards/chosen": -3.712189197540283,
982
- "rewards/margins": 2.495907783508301,
983
- "rewards/rejected": -6.208096981048584,
984
  "step": 320
985
  },
986
  {
987
- "epoch": 0.6860158311345647,
988
- "grad_norm": 24.048438044667563,
989
  "learning_rate": 1.3526919345173318e-07,
990
- "logits/chosen": -2.2532455921173096,
991
- "logits/rejected": -2.1278910636901855,
992
- "logps/chosen": -607.2129516601562,
993
- "logps/rejected": -847.1838989257812,
994
- "loss": 0.4058,
995
  "rewards/accuracies": 0.84375,
996
- "rewards/chosen": -3.399864912033081,
997
- "rewards/margins": 2.236896514892578,
998
- "rewards/rejected": -5.636761665344238,
999
  "step": 325
1000
  },
1001
  {
1002
- "epoch": 0.6965699208443272,
1003
- "grad_norm": 27.608112815101293,
1004
  "learning_rate": 1.2713991827596443e-07,
1005
- "logits/chosen": -2.233346939086914,
1006
- "logits/rejected": -2.2035372257232666,
1007
- "logps/chosen": -589.2955322265625,
1008
- "logps/rejected": -793.0179443359375,
1009
- "loss": 0.3905,
1010
- "rewards/accuracies": 0.793749988079071,
1011
- "rewards/chosen": -3.347853899002075,
1012
- "rewards/margins": 1.9631192684173584,
1013
- "rewards/rejected": -5.31097412109375,
1014
  "step": 330
1015
  },
1016
  {
1017
- "epoch": 0.7071240105540897,
1018
- "grad_norm": 24.14881546063451,
1019
  "learning_rate": 1.191784551934773e-07,
1020
- "logits/chosen": -2.3385255336761475,
1021
- "logits/rejected": -2.322145462036133,
1022
- "logps/chosen": -588.7033081054688,
1023
- "logps/rejected": -806.0431518554688,
1024
- "loss": 0.4061,
1025
- "rewards/accuracies": 0.8374999761581421,
1026
- "rewards/chosen": -3.393342971801758,
1027
- "rewards/margins": 2.0734634399414062,
1028
- "rewards/rejected": -5.466806888580322,
1029
  "step": 335
1030
  },
1031
  {
1032
- "epoch": 0.7176781002638523,
1033
- "grad_norm": 19.815903375155614,
1034
  "learning_rate": 1.1139567860518953e-07,
1035
- "logits/chosen": -2.0588958263397217,
1036
- "logits/rejected": -1.979034423828125,
1037
- "logps/chosen": -593.4244995117188,
1038
- "logps/rejected": -787.120361328125,
1039
- "loss": 0.4265,
1040
- "rewards/accuracies": 0.8187500238418579,
1041
- "rewards/chosen": -3.2387542724609375,
1042
- "rewards/margins": 1.854077696800232,
1043
- "rewards/rejected": -5.092832088470459,
1044
  "step": 340
1045
  },
1046
  {
1047
- "epoch": 0.7282321899736148,
1048
- "grad_norm": 20.071301052736302,
1049
  "learning_rate": 1.0380221884776128e-07,
1050
- "logits/chosen": -2.067850112915039,
1051
- "logits/rejected": -2.048149824142456,
1052
- "logps/chosen": -560.5596923828125,
1053
- "logps/rejected": -704.077880859375,
1054
- "loss": 0.4373,
1055
- "rewards/accuracies": 0.8187500238418579,
1056
- "rewards/chosen": -2.912015438079834,
1057
- "rewards/margins": 1.399601936340332,
1058
- "rewards/rejected": -4.311617374420166,
1059
  "step": 345
1060
  },
1061
  {
1062
- "epoch": 0.7387862796833773,
1063
- "grad_norm": 18.162804393534355,
1064
  "learning_rate": 9.640844767383405e-08,
1065
- "logits/chosen": -2.1664066314697266,
1066
- "logits/rejected": -2.08605694770813,
1067
- "logps/chosen": -543.7811279296875,
1068
- "logps/rejected": -715.3802490234375,
1069
- "loss": 0.4225,
1070
- "rewards/accuracies": 0.8999999761581421,
1071
- "rewards/chosen": -2.847567319869995,
1072
- "rewards/margins": 1.6358835697174072,
1073
- "rewards/rejected": -4.4834513664245605,
1074
  "step": 350
1075
  },
1076
  {
1077
- "epoch": 0.7493403693931399,
1078
- "grad_norm": 22.0345662189636,
1079
  "learning_rate": 8.922446408546378e-08,
1080
- "logits/chosen": -2.125089168548584,
1081
- "logits/rejected": -2.0595450401306152,
1082
- "logps/chosen": -593.4921875,
1083
- "logps/rejected": -794.3736572265625,
1084
- "loss": 0.4491,
1085
- "rewards/accuracies": 0.7875000238418579,
1086
- "rewards/chosen": -3.1958818435668945,
1087
- "rewards/margins": 1.8830915689468384,
1088
- "rewards/rejected": -5.078973293304443,
1089
  "step": 355
1090
  },
1091
  {
1092
- "epoch": 0.7598944591029023,
1093
- "grad_norm": 37.283205456222554,
1094
  "learning_rate": 8.22600805400994e-08,
1095
- "logits/chosen": -2.119860887527466,
1096
- "logits/rejected": -2.025869846343994,
1097
- "logps/chosen": -572.8893432617188,
1098
- "logps/rejected": -800.1495361328125,
1099
- "loss": 0.3879,
1100
- "rewards/accuracies": 0.856249988079071,
1101
- "rewards/chosen": -3.0747852325439453,
1102
- "rewards/margins": 2.0609545707702637,
1103
- "rewards/rejected": -5.135739326477051,
1104
  "step": 360
1105
  },
1106
  {
1107
- "epoch": 0.7704485488126649,
1108
- "grad_norm": 23.893898212231402,
1109
  "learning_rate": 7.552480954794558e-08,
1110
- "logits/chosen": -2.0981643199920654,
1111
- "logits/rejected": -2.010963201522827,
1112
- "logps/chosen": -598.5560302734375,
1113
- "logps/rejected": -791.58349609375,
1114
- "loss": 0.4217,
1115
- "rewards/accuracies": 0.793749988079071,
1116
- "rewards/chosen": -3.4471168518066406,
1117
- "rewards/margins": 1.8089519739151,
1118
- "rewards/rejected": -5.256069183349609,
1119
  "step": 365
1120
  },
1121
  {
1122
- "epoch": 0.7810026385224275,
1123
- "grad_norm": 27.916098925400245,
1124
  "learning_rate": 6.902785067901854e-08,
1125
- "logits/chosen": -2.1697256565093994,
1126
- "logits/rejected": -2.015242099761963,
1127
- "logps/chosen": -603.3410034179688,
1128
- "logps/rejected": -844.3304443359375,
1129
- "loss": 0.3863,
1130
- "rewards/accuracies": 0.8687499761581421,
1131
- "rewards/chosen": -3.3369498252868652,
1132
- "rewards/margins": 2.220611810684204,
1133
- "rewards/rejected": -5.557561874389648,
1134
  "step": 370
1135
  },
1136
  {
1137
- "epoch": 0.7915567282321899,
1138
- "grad_norm": 27.790853080729732,
1139
  "learning_rate": 6.277807799763973e-08,
1140
- "logits/chosen": -2.1927974224090576,
1141
- "logits/rejected": -2.077242136001587,
1142
- "logps/chosen": -604.877685546875,
1143
- "logps/rejected": -836.9320068359375,
1144
- "loss": 0.4036,
1145
- "rewards/accuracies": 0.8812500238418579,
1146
- "rewards/chosen": -3.460221767425537,
1147
- "rewards/margins": 2.2005088329315186,
1148
- "rewards/rejected": -5.660730361938477,
1149
  "step": 375
1150
  },
1151
  {
1152
- "epoch": 0.8021108179419525,
1153
- "grad_norm": 26.505958464528764,
1154
  "learning_rate": 5.678402794153145e-08,
1155
- "logits/chosen": -2.265552282333374,
1156
- "logits/rejected": -2.2187042236328125,
1157
- "logps/chosen": -644.1717529296875,
1158
- "logps/rejected": -856.8342895507812,
1159
- "loss": 0.4045,
1160
- "rewards/accuracies": 0.793749988079071,
1161
- "rewards/chosen": -3.8418126106262207,
1162
- "rewards/margins": 2.0028209686279297,
1163
- "rewards/rejected": -5.844632625579834,
1164
  "step": 380
1165
  },
1166
  {
1167
- "epoch": 0.8126649076517151,
1168
- "grad_norm": 28.453921770012606,
1169
  "learning_rate": 5.105388766206969e-08,
1170
- "logits/chosen": -2.355292797088623,
1171
- "logits/rejected": -2.2420361042022705,
1172
- "logps/chosen": -691.5671997070312,
1173
- "logps/rejected": -934.6068115234375,
1174
- "loss": 0.443,
1175
- "rewards/accuracies": 0.824999988079071,
1176
- "rewards/chosen": -4.222853660583496,
1177
- "rewards/margins": 2.315927028656006,
1178
- "rewards/rejected": -6.538781642913818,
1179
  "step": 385
1180
  },
1181
  {
1182
- "epoch": 0.8232189973614775,
1183
- "grad_norm": 21.35738866439425,
1184
  "learning_rate": 4.5595483841620484e-08,
1185
- "logits/chosen": -2.1776041984558105,
1186
- "logits/rejected": -2.1361899375915527,
1187
- "logps/chosen": -658.7529907226562,
1188
- "logps/rejected": -870.3590087890625,
1189
- "loss": 0.378,
1190
- "rewards/accuracies": 0.8187500238418579,
1191
- "rewards/chosen": -3.8739631175994873,
1192
- "rewards/margins": 2.0037648677825928,
1193
- "rewards/rejected": -5.877728462219238,
1194
  "step": 390
1195
  },
1196
  {
1197
- "epoch": 0.8337730870712401,
1198
- "grad_norm": 25.73671420821126,
1199
  "learning_rate": 4.0416272003232526e-08,
1200
- "logits/chosen": -2.1355865001678467,
1201
- "logits/rejected": -2.0880231857299805,
1202
- "logps/chosen": -632.5217895507812,
1203
- "logps/rejected": -861.2312622070312,
1204
- "loss": 0.44,
1205
  "rewards/accuracies": 0.856249988079071,
1206
- "rewards/chosen": -3.6603481769561768,
1207
- "rewards/margins": 2.233140230178833,
1208
- "rewards/rejected": -5.893488883972168,
1209
  "step": 395
1210
  },
1211
  {
1212
- "epoch": 0.8443271767810027,
1213
- "grad_norm": 24.57511432896418,
1214
  "learning_rate": 3.552332632729041e-08,
1215
- "logits/chosen": -2.040531873703003,
1216
- "logits/rejected": -2.077538251876831,
1217
- "logps/chosen": -628.4180908203125,
1218
- "logps/rejected": -797.4384765625,
1219
- "loss": 0.4222,
1220
- "rewards/accuracies": 0.762499988079071,
1221
- "rewards/chosen": -3.6359188556671143,
1222
- "rewards/margins": 1.6638940572738647,
1223
- "rewards/rejected": -5.299813270568848,
1224
  "step": 400
1225
  },
1226
  {
1227
- "epoch": 0.8443271767810027,
1228
- "eval_logits/chosen": -2.7593374252319336,
1229
- "eval_logits/rejected": -2.6865265369415283,
1230
- "eval_logps/chosen": -595.5198974609375,
1231
- "eval_logps/rejected": -786.4964599609375,
1232
- "eval_loss": 0.39839133620262146,
1233
- "eval_rewards/accuracies": 0.8286290168762207,
1234
- "eval_rewards/chosen": -3.3262782096862793,
1235
- "eval_rewards/margins": 1.799713134765625,
1236
- "eval_rewards/rejected": -5.125991344451904,
1237
- "eval_runtime": 315.3526,
1238
- "eval_samples_per_second": 6.266,
1239
- "eval_steps_per_second": 0.393,
1240
  "step": 400
1241
  },
1242
  {
1243
- "epoch": 0.8548812664907651,
1244
- "grad_norm": 20.4806371393051,
1245
  "learning_rate": 3.092332998903416e-08,
1246
- "logits/chosen": -2.1178812980651855,
1247
- "logits/rejected": -2.0984854698181152,
1248
- "logps/chosen": -637.4102783203125,
1249
- "logps/rejected": -846.5029296875,
1250
- "loss": 0.3953,
1251
- "rewards/accuracies": 0.856249988079071,
1252
- "rewards/chosen": -3.625606060028076,
1253
- "rewards/margins": 2.061870574951172,
1254
- "rewards/rejected": -5.687476634979248,
1255
  "step": 405
1256
  },
1257
  {
1258
- "epoch": 0.8654353562005277,
1259
- "grad_norm": 19.547579178485496,
1260
  "learning_rate": 2.6622566030146455e-08,
1261
- "logits/chosen": -2.1973793506622314,
1262
- "logits/rejected": -2.171604633331299,
1263
- "logps/chosen": -557.0053100585938,
1264
- "logps/rejected": -746.3121337890625,
1265
- "loss": 0.4256,
1266
- "rewards/accuracies": 0.831250011920929,
1267
- "rewards/chosen": -3.149854898452759,
1268
- "rewards/margins": 1.7529436349868774,
1269
- "rewards/rejected": -4.902798652648926,
1270
  "step": 410
1271
  },
1272
  {
1273
- "epoch": 0.8759894459102903,
1274
- "grad_norm": 19.129911424402337,
1275
  "learning_rate": 2.26269087768734e-08,
1276
- "logits/chosen": -2.1681036949157715,
1277
- "logits/rejected": -2.006333589553833,
1278
- "logps/chosen": -610.58837890625,
1279
- "logps/rejected": -869.3065185546875,
1280
- "loss": 0.3987,
1281
- "rewards/accuracies": 0.831250011920929,
1282
- "rewards/chosen": -3.558450698852539,
1283
- "rewards/margins": 2.386261463165283,
1284
- "rewards/rejected": -5.9447126388549805,
1285
  "step": 415
1286
  },
1287
  {
1288
- "epoch": 0.8865435356200527,
1289
- "grad_norm": 24.716365813368494,
1290
  "learning_rate": 1.894181581640106e-08,
1291
- "logits/chosen": -2.2324867248535156,
1292
- "logits/rejected": -2.2453224658966064,
1293
- "logps/chosen": -601.86083984375,
1294
- "logps/rejected": -790.0075073242188,
1295
- "loss": 0.3941,
1296
- "rewards/accuracies": 0.737500011920929,
1297
- "rewards/chosen": -3.448594331741333,
1298
- "rewards/margins": 1.7875158786773682,
1299
- "rewards/rejected": -5.236110210418701,
1300
  "step": 420
1301
  },
1302
  {
1303
- "epoch": 0.8970976253298153,
1304
- "grad_norm": 32.13959851586395,
1305
  "learning_rate": 1.5572320542448143e-08,
1306
- "logits/chosen": -2.2512707710266113,
1307
- "logits/rejected": -2.20418119430542,
1308
- "logps/chosen": -625.6372680664062,
1309
- "logps/rejected": -828.36083984375,
1310
- "loss": 0.4037,
1311
- "rewards/accuracies": 0.824999988079071,
1312
- "rewards/chosen": -3.624408006668091,
1313
- "rewards/margins": 1.9519973993301392,
1314
- "rewards/rejected": -5.576405048370361,
1315
  "step": 425
1316
  },
1317
  {
1318
- "epoch": 0.9076517150395779,
1319
- "grad_norm": 21.087098841456804,
1320
  "learning_rate": 1.2523025280255729e-08,
1321
- "logits/chosen": -2.314072847366333,
1322
- "logits/rejected": -2.28322434425354,
1323
- "logps/chosen": -619.37060546875,
1324
- "logps/rejected": -859.1106567382812,
1325
- "loss": 0.3474,
1326
- "rewards/accuracies": 0.862500011920929,
1327
- "rewards/chosen": -3.540250062942505,
1328
- "rewards/margins": 2.301335334777832,
1329
- "rewards/rejected": -5.841585159301758,
1330
  "step": 430
1331
  },
1332
  {
1333
- "epoch": 0.9182058047493403,
1334
- "grad_norm": 18.102509884061345,
1335
  "learning_rate": 9.798095000364214e-09,
1336
- "logits/chosen": -2.378577470779419,
1337
- "logits/rejected": -2.214040994644165,
1338
- "logps/chosen": -613.8382568359375,
1339
- "logps/rejected": -870.4904174804688,
1340
- "loss": 0.3723,
1341
- "rewards/accuracies": 0.8374999761581421,
1342
- "rewards/chosen": -3.5427989959716797,
1343
- "rewards/margins": 2.341104030609131,
1344
- "rewards/rejected": -5.8839030265808105,
1345
  "step": 435
1346
  },
1347
  {
1348
- "epoch": 0.9287598944591029,
1349
- "grad_norm": 24.09594523964464,
1350
  "learning_rate": 7.401251629764876e-09,
1351
- "logits/chosen": -2.230398416519165,
1352
- "logits/rejected": -2.044609308242798,
1353
- "logps/chosen": -635.7887573242188,
1354
- "logps/rejected": -865.6220703125,
1355
- "loss": 0.4132,
1356
- "rewards/accuracies": 0.800000011920929,
1357
- "rewards/chosen": -3.677701473236084,
1358
- "rewards/margins": 2.0847156047821045,
1359
- "rewards/rejected": -5.762416839599609,
1360
  "step": 440
1361
  },
1362
  {
1363
- "epoch": 0.9393139841688655,
1364
- "grad_norm": 18.69976567383702,
1365
  "learning_rate": 5.335768968195098e-09,
1366
- "logits/chosen": -2.1324424743652344,
1367
- "logits/rejected": -2.0235095024108887,
1368
- "logps/chosen": -618.6690673828125,
1369
- "logps/rejected": -826.8605346679688,
1370
- "loss": 0.4125,
1371
- "rewards/accuracies": 0.8187500238418579,
1372
- "rewards/chosen": -3.5293784141540527,
1373
- "rewards/margins": 1.9530022144317627,
1374
- "rewards/rejected": -5.4823808670043945,
1375
  "step": 445
1376
  },
1377
  {
1378
- "epoch": 0.9498680738786279,
1379
- "grad_norm": 15.92889127250539,
1380
  "learning_rate": 3.604468216521883e-09,
1381
- "logits/chosen": -2.2540245056152344,
1382
- "logits/rejected": -2.232203960418701,
1383
- "logps/chosen": -600.1151123046875,
1384
- "logps/rejected": -796.59423828125,
1385
- "loss": 0.3844,
1386
- "rewards/accuracies": 0.84375,
1387
- "rewards/chosen": -3.402463912963867,
1388
- "rewards/margins": 1.903550148010254,
1389
- "rewards/rejected": -5.306014060974121,
1390
  "step": 450
1391
  },
1392
  {
1393
- "epoch": 0.9604221635883905,
1394
- "grad_norm": 18.753569800561838,
1395
  "learning_rate": 2.2097141233206884e-09,
1396
- "logits/chosen": -2.1656556129455566,
1397
- "logits/rejected": -2.1333932876586914,
1398
- "logps/chosen": -624.7294921875,
1399
- "logps/rejected": -828.1585693359375,
1400
- "loss": 0.3908,
1401
- "rewards/accuracies": 0.8062499761581421,
1402
- "rewards/chosen": -3.7328314781188965,
1403
- "rewards/margins": 1.9572765827178955,
1404
- "rewards/rejected": -5.690107345581055,
1405
  "step": 455
1406
  },
1407
  {
1408
- "epoch": 0.9709762532981531,
1409
- "grad_norm": 19.85121890931105,
1410
  "learning_rate": 1.1534117549133472e-09,
1411
- "logits/chosen": -2.364999294281006,
1412
- "logits/rejected": -2.1894242763519287,
1413
- "logps/chosen": -624.747802734375,
1414
- "logps/rejected": -858.8040161132812,
1415
- "loss": 0.3658,
1416
- "rewards/accuracies": 0.8687499761581421,
1417
- "rewards/chosen": -3.7442946434020996,
1418
- "rewards/margins": 2.1538589000701904,
1419
- "rewards/rejected": -5.898154258728027,
1420
  "step": 460
1421
  },
1422
  {
1423
- "epoch": 0.9815303430079155,
1424
- "grad_norm": 39.10841866963654,
1425
  "learning_rate": 4.3700389327672173e-10,
1426
- "logits/chosen": -2.2868332862854004,
1427
- "logits/rejected": -2.1618874073028564,
1428
- "logps/chosen": -634.08447265625,
1429
- "logps/rejected": -845.2247924804688,
1430
- "loss": 0.3908,
1431
- "rewards/accuracies": 0.831250011920929,
1432
- "rewards/chosen": -3.793625593185425,
1433
- "rewards/margins": 1.9014127254486084,
1434
- "rewards/rejected": -5.695038318634033,
1435
  "step": 465
1436
  },
1437
  {
1438
- "epoch": 0.9920844327176781,
1439
- "grad_norm": 18.498519136680624,
1440
  "learning_rate": 6.146906537587982e-11,
1441
- "logits/chosen": -2.2575690746307373,
1442
- "logits/rejected": -2.1273903846740723,
1443
- "logps/chosen": -600.2813720703125,
1444
- "logps/rejected": -810.6456298828125,
1445
- "loss": 0.396,
1446
- "rewards/accuracies": 0.831250011920929,
1447
- "rewards/chosen": -3.4551138877868652,
1448
- "rewards/margins": 1.9376386404037476,
1449
- "rewards/rejected": -5.392752647399902,
1450
  "step": 470
1451
  },
1452
  {
1453
- "epoch": 0.9984168865435357,
1454
  "step": 473,
1455
  "total_flos": 0.0,
1456
- "train_loss": 0.466365703316622,
1457
- "train_runtime": 19524.7969,
1458
- "train_samples_per_second": 3.105,
1459
- "train_steps_per_second": 0.024
1460
  }
1461
  ],
1462
  "logging_steps": 5,
@@ -1477,7 +1477,7 @@
1477
  }
1478
  },
1479
  "total_flos": 0.0,
1480
- "train_batch_size": 2,
1481
  "trial_name": null,
1482
  "trial_params": null
1483
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9984827495217362,
5
  "eval_steps": 400,
6
  "global_step": 473,
7
  "is_hyper_param_search": false,
 
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.002110957187149548,
13
+ "grad_norm": 4.06041781261902,
14
  "learning_rate": 1.0416666666666666e-08,
15
+ "logits/chosen": -0.9878771901130676,
16
+ "logits/rejected": -0.7230668663978577,
17
+ "logps/chosen": -251.34963989257812,
18
+ "logps/rejected": -287.15838623046875,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
24
  "step": 1
25
  },
26
  {
27
+ "epoch": 0.01055478593574774,
28
+ "grad_norm": 4.105776204209711,
29
  "learning_rate": 5.208333333333333e-08,
30
+ "logits/chosen": -0.9975427985191345,
31
+ "logits/rejected": -0.6888133883476257,
32
+ "logps/chosen": -272.86993408203125,
33
+ "logps/rejected": -286.1126708984375,
34
+ "loss": 0.6932,
35
+ "rewards/accuracies": 0.359375,
36
+ "rewards/chosen": -0.00011302110215183347,
37
+ "rewards/margins": -0.00016400158347096294,
38
+ "rewards/rejected": 5.0980423111468554e-05,
39
  "step": 5
40
  },
41
  {
42
+ "epoch": 0.02110957187149548,
43
+ "grad_norm": 4.582740122598074,
44
  "learning_rate": 1.0416666666666667e-07,
45
+ "logits/chosen": -1.0306963920593262,
46
+ "logits/rejected": -0.7614760398864746,
47
+ "logps/chosen": -274.9240417480469,
48
+ "logps/rejected": -293.0102233886719,
49
  "loss": 0.6933,
50
+ "rewards/accuracies": 0.5625,
51
+ "rewards/chosen": 0.0007906880346126854,
52
+ "rewards/margins": 0.00033544833422638476,
53
+ "rewards/rejected": 0.00045523978769779205,
54
  "step": 10
55
  },
56
  {
57
+ "epoch": 0.03166435780724322,
58
+ "grad_norm": 4.2774752538157,
59
  "learning_rate": 1.5624999999999999e-07,
60
+ "logits/chosen": -0.9785528182983398,
61
+ "logits/rejected": -0.6860870122909546,
62
+ "logps/chosen": -253.0674285888672,
63
+ "logps/rejected": -277.63006591796875,
64
+ "loss": 0.693,
65
+ "rewards/accuracies": 0.44999998807907104,
66
+ "rewards/chosen": 0.00013054809824097902,
67
+ "rewards/margins": -4.017539322376251e-05,
68
+ "rewards/rejected": 0.0001707235351204872,
69
  "step": 15
70
  },
71
  {
72
+ "epoch": 0.04221914374299096,
73
+ "grad_norm": 4.154426876518502,
74
  "learning_rate": 2.0833333333333333e-07,
75
+ "logits/chosen": -0.8861902356147766,
76
+ "logits/rejected": -0.7466350793838501,
77
+ "logps/chosen": -303.89990234375,
78
+ "logps/rejected": -317.44354248046875,
79
+ "loss": 0.6929,
80
+ "rewards/accuracies": 0.48750001192092896,
81
+ "rewards/chosen": -0.0017729544779285789,
82
+ "rewards/margins": 0.0007360944291576743,
83
+ "rewards/rejected": -0.0025090486742556095,
84
  "step": 20
85
  },
86
  {
87
+ "epoch": 0.0527739296787387,
88
+ "grad_norm": 4.391172268221911,
89
  "learning_rate": 2.604166666666667e-07,
90
+ "logits/chosen": -0.9347459077835083,
91
+ "logits/rejected": -0.7482324242591858,
92
+ "logps/chosen": -257.277099609375,
93
+ "logps/rejected": -276.6146240234375,
94
+ "loss": 0.6925,
95
+ "rewards/accuracies": 0.5625,
96
+ "rewards/chosen": -0.0021017056424170732,
97
+ "rewards/margins": 0.001351111801341176,
98
+ "rewards/rejected": -0.003452816978096962,
99
  "step": 25
100
  },
101
  {
102
+ "epoch": 0.06332871561448644,
103
+ "grad_norm": 4.197922988205761,
104
  "learning_rate": 3.1249999999999997e-07,
105
+ "logits/chosen": -0.9495643377304077,
106
+ "logits/rejected": -0.722493052482605,
107
+ "logps/chosen": -271.043212890625,
108
+ "logps/rejected": -304.54779052734375,
109
+ "loss": 0.6909,
110
+ "rewards/accuracies": 0.675000011920929,
111
+ "rewards/chosen": -0.007976134307682514,
112
+ "rewards/margins": 0.006574218161404133,
113
+ "rewards/rejected": -0.014550352469086647,
114
  "step": 30
115
  },
116
  {
117
+ "epoch": 0.07388350155023418,
118
+ "grad_norm": 4.146928571575294,
119
  "learning_rate": 3.645833333333333e-07,
120
+ "logits/chosen": -0.9740797281265259,
121
+ "logits/rejected": -0.6511734127998352,
122
+ "logps/chosen": -269.66558837890625,
123
+ "logps/rejected": -282.5665588378906,
124
+ "loss": 0.6901,
125
+ "rewards/accuracies": 0.643750011920929,
126
+ "rewards/chosen": -0.014239413663744926,
127
+ "rewards/margins": 0.006516980938613415,
128
+ "rewards/rejected": -0.020756395533680916,
129
  "step": 35
130
  },
131
  {
132
+ "epoch": 0.08443828748598192,
133
+ "grad_norm": 4.434625304551974,
134
  "learning_rate": 4.1666666666666667e-07,
135
+ "logits/chosen": -0.9476411938667297,
136
+ "logits/rejected": -0.8075205087661743,
137
+ "logps/chosen": -254.89315795898438,
138
+ "logps/rejected": -269.08843994140625,
139
+ "loss": 0.6872,
140
+ "rewards/accuracies": 0.706250011920929,
141
+ "rewards/chosen": -0.025963688269257545,
142
+ "rewards/margins": 0.010111861862242222,
143
+ "rewards/rejected": -0.03607555106282234,
144
  "step": 40
145
  },
146
  {
147
+ "epoch": 0.09499307342172966,
148
+ "grad_norm": 4.334333725960549,
149
  "learning_rate": 4.6874999999999996e-07,
150
+ "logits/chosen": -0.9668118357658386,
151
+ "logits/rejected": -0.7818160057067871,
152
+ "logps/chosen": -262.55804443359375,
153
+ "logps/rejected": -279.33416748046875,
154
+ "loss": 0.6841,
155
+ "rewards/accuracies": 0.6499999761581421,
156
+ "rewards/chosen": -0.03984779864549637,
157
+ "rewards/margins": 0.01882680132985115,
158
+ "rewards/rejected": -0.05867459625005722,
159
  "step": 45
160
  },
161
  {
162
+ "epoch": 0.1055478593574774,
163
+ "grad_norm": 4.672589574927753,
164
  "learning_rate": 4.999726797933858e-07,
165
+ "logits/chosen": -1.1188008785247803,
166
+ "logits/rejected": -0.7495776414871216,
167
+ "logps/chosen": -274.8739318847656,
168
+ "logps/rejected": -298.7273254394531,
169
+ "loss": 0.6736,
170
+ "rewards/accuracies": 0.7562500238418579,
171
+ "rewards/chosen": -0.0697299912571907,
172
+ "rewards/margins": 0.04492691531777382,
173
+ "rewards/rejected": -0.11465690284967422,
174
  "step": 50
175
  },
176
  {
177
+ "epoch": 0.11610264529322514,
178
+ "grad_norm": 6.181545467894853,
179
  "learning_rate": 4.99665396039775e-07,
180
+ "logits/chosen": -1.1051629781723022,
181
+ "logits/rejected": -0.9252668619155884,
182
+ "logps/chosen": -270.87841796875,
183
+ "logps/rejected": -289.1105651855469,
184
+ "loss": 0.6576,
185
+ "rewards/accuracies": 0.731249988079071,
186
+ "rewards/chosen": -0.14251390099525452,
187
+ "rewards/margins": 0.10730250179767609,
188
+ "rewards/rejected": -0.2498163878917694,
189
  "step": 55
190
  },
191
  {
192
+ "epoch": 0.12665743122897288,
193
+ "grad_norm": 7.558458036407823,
194
  "learning_rate": 4.99017099386437e-07,
195
+ "logits/chosen": -1.3753994703292847,
196
+ "logits/rejected": -1.1374595165252686,
197
+ "logps/chosen": -306.73138427734375,
198
+ "logps/rejected": -341.1016845703125,
199
+ "loss": 0.6281,
200
  "rewards/accuracies": 0.75,
201
+ "rewards/chosen": -0.37672942876815796,
202
+ "rewards/margins": 0.20342817902565002,
203
+ "rewards/rejected": -0.5801576375961304,
204
  "step": 60
205
  },
206
  {
207
+ "epoch": 0.13721221716472062,
208
+ "grad_norm": 66.9078697860869,
209
  "learning_rate": 4.980286753286194e-07,
210
+ "logits/chosen": -1.5234780311584473,
211
+ "logits/rejected": -1.2897632122039795,
212
+ "logps/chosen": -359.58074951171875,
213
+ "logps/rejected": -414.0733947753906,
214
+ "loss": 0.6211,
215
+ "rewards/accuracies": 0.6625000238418579,
216
+ "rewards/chosen": -1.063786506652832,
217
+ "rewards/margins": 0.41773301362991333,
218
+ "rewards/rejected": -1.4815195798873901,
219
  "step": 65
220
  },
221
  {
222
+ "epoch": 0.14776700310046836,
223
+ "grad_norm": 12.864645326069963,
224
  "learning_rate": 4.967014739346915e-07,
225
+ "logits/chosen": -1.4892756938934326,
226
+ "logits/rejected": -1.3145514726638794,
227
+ "logps/chosen": -410.1991271972656,
228
+ "logps/rejected": -467.57623291015625,
229
+ "loss": 0.6578,
230
+ "rewards/accuracies": 0.7124999761581421,
231
+ "rewards/chosen": -1.4497530460357666,
232
+ "rewards/margins": 0.4113241136074066,
233
+ "rewards/rejected": -1.8610769510269165,
234
  "step": 70
235
  },
236
  {
237
+ "epoch": 0.1583217890362161,
238
+ "grad_norm": 12.52823747211519,
239
  "learning_rate": 4.950373080021136e-07,
240
+ "logits/chosen": -1.2871811389923096,
241
+ "logits/rejected": -1.2107694149017334,
242
+ "logps/chosen": -333.7770080566406,
243
+ "logps/rejected": -356.77728271484375,
244
+ "loss": 0.5936,
245
+ "rewards/accuracies": 0.6875,
246
+ "rewards/chosen": -0.579529881477356,
247
+ "rewards/margins": 0.2453722506761551,
248
+ "rewards/rejected": -0.8249020576477051,
249
  "step": 75
250
  },
251
  {
252
+ "epoch": 0.16887657497196384,
253
+ "grad_norm": 17.45417402457315,
254
  "learning_rate": 4.930384505813737e-07,
255
+ "logits/chosen": -1.4544992446899414,
256
+ "logits/rejected": -1.3903002738952637,
257
+ "logps/chosen": -392.7482604980469,
258
+ "logps/rejected": -472.0469665527344,
259
+ "loss": 0.5588,
260
+ "rewards/accuracies": 0.762499988079071,
261
+ "rewards/chosen": -1.0880345106124878,
262
+ "rewards/margins": 0.6102110147476196,
263
+ "rewards/rejected": -1.6982454061508179,
264
  "step": 80
265
  },
266
  {
267
+ "epoch": 0.17943136090771158,
268
+ "grad_norm": 36.09231440231977,
269
  "learning_rate": 4.907076318712738e-07,
270
+ "logits/chosen": -1.5770976543426514,
271
+ "logits/rejected": -1.4604318141937256,
272
+ "logps/chosen": -478.90264892578125,
273
+ "logps/rejected": -560.0563354492188,
274
+ "loss": 0.557,
275
+ "rewards/accuracies": 0.699999988079071,
276
+ "rewards/chosen": -1.8632774353027344,
277
+ "rewards/margins": 0.6757477521896362,
278
+ "rewards/rejected": -2.53902530670166,
279
  "step": 85
280
  },
281
  {
282
+ "epoch": 0.18998614684345932,
283
+ "grad_norm": 12.93821128098639,
284
  "learning_rate": 4.88048035489807e-07,
285
+ "logits/chosen": -1.5192902088165283,
286
+ "logits/rejected": -1.3578670024871826,
287
+ "logps/chosen": -401.8139343261719,
288
+ "logps/rejected": -476.27557373046875,
289
+ "loss": 0.5413,
290
+ "rewards/accuracies": 0.7875000238418579,
291
+ "rewards/chosen": -1.2336255311965942,
292
+ "rewards/margins": 0.6196562051773071,
293
+ "rewards/rejected": -1.8532816171646118,
294
  "step": 90
295
  },
296
  {
297
+ "epoch": 0.20054093277920707,
298
+ "grad_norm": 10.174846260495245,
299
  "learning_rate": 4.85063294125718e-07,
300
+ "logits/chosen": -1.4184257984161377,
301
+ "logits/rejected": -1.349448561668396,
302
+ "logps/chosen": -405.37030029296875,
303
+ "logps/rejected": -465.87835693359375,
304
+ "loss": 0.584,
305
+ "rewards/accuracies": 0.7437499761581421,
306
+ "rewards/chosen": -1.2496789693832397,
307
+ "rewards/margins": 0.4809112548828125,
308
+ "rewards/rejected": -1.7305902242660522,
309
  "step": 95
310
  },
311
  {
312
+ "epoch": 0.2110957187149548,
313
+ "grad_norm": 11.934921621793764,
314
  "learning_rate": 4.817574845766874e-07,
315
+ "logits/chosen": -1.6645218133926392,
316
+ "logits/rejected": -1.4362655878067017,
317
+ "logps/chosen": -419.6376953125,
318
+ "logps/rejected": -511.97955322265625,
319
+ "loss": 0.5179,
320
  "rewards/accuracies": 0.7749999761581421,
321
+ "rewards/chosen": -1.6008678674697876,
322
+ "rewards/margins": 0.7449867129325867,
323
+ "rewards/rejected": -2.3458542823791504,
324
  "step": 100
325
  },
326
  {
327
+ "epoch": 0.22165050465070255,
328
+ "grad_norm": 11.911318908319176,
329
  "learning_rate": 4.781351221809166e-07,
330
+ "logits/chosen": -1.6439392566680908,
331
+ "logits/rejected": -1.5432502031326294,
332
+ "logps/chosen": -462.12249755859375,
333
+ "logps/rejected": -561.0224609375,
334
+ "loss": 0.563,
335
+ "rewards/accuracies": 0.737500011920929,
336
+ "rewards/chosen": -2.0325913429260254,
337
+ "rewards/margins": 0.8397369384765625,
338
+ "rewards/rejected": -2.872328281402588,
339
  "step": 105
340
  },
341
  {
342
+ "epoch": 0.23220529058645029,
343
+ "grad_norm": 12.017204037189543,
344
  "learning_rate": 4.742011546497182e-07,
345
+ "logits/chosen": -1.5051389932632446,
346
+ "logits/rejected": -1.4414231777191162,
347
+ "logps/chosen": -449.30657958984375,
348
+ "logps/rejected": -550.5023803710938,
349
+ "loss": 0.5068,
350
+ "rewards/accuracies": 0.8062499761581421,
351
+ "rewards/chosen": -1.5960687398910522,
352
+ "rewards/margins": 0.8259360194206238,
353
+ "rewards/rejected": -2.4220046997070312,
354
  "step": 110
355
  },
356
  {
357
+ "epoch": 0.24276007652219803,
358
+ "grad_norm": 16.38172474746581,
359
  "learning_rate": 4.6996095530953875e-07,
360
+ "logits/chosen": -1.6456964015960693,
361
+ "logits/rejected": -1.4991674423217773,
362
+ "logps/chosen": -492.7826232910156,
363
+ "logps/rejected": -631.8049926757812,
364
+ "loss": 0.5084,
365
+ "rewards/accuracies": 0.824999988079071,
366
+ "rewards/chosen": -2.1061949729919434,
367
+ "rewards/margins": 1.1719015836715698,
368
+ "rewards/rejected": -3.2780966758728027,
369
  "step": 115
370
  },
371
  {
372
+ "epoch": 0.25331486245794577,
373
+ "grad_norm": 19.153944069009537,
374
  "learning_rate": 4.654203157626399e-07,
375
+ "logits/chosen": -1.7168292999267578,
376
+ "logits/rejected": -1.5979254245758057,
377
+ "logps/chosen": -499.25775146484375,
378
+ "logps/rejected": -663.107177734375,
379
+ "loss": 0.5023,
380
+ "rewards/accuracies": 0.762499988079071,
381
+ "rewards/chosen": -2.420952558517456,
382
+ "rewards/margins": 1.3582748174667358,
383
+ "rewards/rejected": -3.7792270183563232,
384
  "step": 120
385
  },
386
  {
387
+ "epoch": 0.26386964839369353,
388
+ "grad_norm": 15.209663016177657,
389
  "learning_rate": 4.605854379764673e-07,
390
+ "logits/chosen": -1.529827356338501,
391
+ "logits/rejected": -1.4732040166854858,
392
+ "logps/chosen": -422.29827880859375,
393
+ "logps/rejected": -505.83154296875,
394
+ "loss": 0.4999,
395
+ "rewards/accuracies": 0.800000011920929,
396
+ "rewards/chosen": -1.6309483051300049,
397
+ "rewards/margins": 0.74617999792099,
398
+ "rewards/rejected": -2.3771283626556396,
399
  "step": 125
400
  },
401
  {
402
+ "epoch": 0.27442443432944125,
403
+ "grad_norm": 11.779884218050361,
404
  "learning_rate": 4.5546292581250857e-07,
405
+ "logits/chosen": -1.5948470830917358,
406
+ "logits/rejected": -1.4933321475982666,
407
+ "logps/chosen": -455.269287109375,
408
+ "logps/rejected": -543.4944458007812,
409
+ "loss": 0.4868,
410
+ "rewards/accuracies": 0.8062499761581421,
411
+ "rewards/chosen": -1.7508251667022705,
412
+ "rewards/margins": 0.8269698023796082,
413
+ "rewards/rejected": -2.5777950286865234,
414
  "step": 130
415
  },
416
  {
417
+ "epoch": 0.284979220265189,
418
+ "grad_norm": 19.10994211444829,
419
  "learning_rate": 4.5005977600621275e-07,
420
+ "logits/chosen": -1.7589390277862549,
421
+ "logits/rejected": -1.5872992277145386,
422
+ "logps/chosen": -568.4318237304688,
423
+ "logps/rejected": -727.857666015625,
424
+ "loss": 0.4781,
425
+ "rewards/accuracies": 0.75,
426
+ "rewards/chosen": -2.8650763034820557,
427
+ "rewards/margins": 1.3692649602890015,
428
+ "rewards/rejected": -4.234341621398926,
429
  "step": 135
430
  },
431
  {
432
+ "epoch": 0.2955340062009367,
433
+ "grad_norm": 15.492507693472131,
434
  "learning_rate": 4.443833686102919e-07,
435
+ "logits/chosen": -1.6332670450210571,
436
+ "logits/rejected": -1.4034180641174316,
437
+ "logps/chosen": -461.40826416015625,
438
+ "logps/rejected": -608.6381225585938,
439
+ "loss": 0.4705,
440
+ "rewards/accuracies": 0.824999988079071,
441
+ "rewards/chosen": -1.9456875324249268,
442
+ "rewards/margins": 1.1761558055877686,
443
+ "rewards/rejected": -3.1218433380126953,
444
  "step": 140
445
  },
446
  {
447
+ "epoch": 0.3060887921366845,
448
+ "grad_norm": 16.867870056444893,
449
  "learning_rate": 4.384414569144561e-07,
450
+ "logits/chosen": -1.5909126996994019,
451
+ "logits/rejected": -1.5102109909057617,
452
+ "logps/chosen": -441.18670654296875,
453
+ "logps/rejected": -565.0006713867188,
454
+ "loss": 0.5053,
455
+ "rewards/accuracies": 0.793749988079071,
456
+ "rewards/chosen": -1.909717321395874,
457
+ "rewards/margins": 1.0274746417999268,
458
+ "rewards/rejected": -2.937191963195801,
459
  "step": 145
460
  },
461
  {
462
+ "epoch": 0.3166435780724322,
463
+ "grad_norm": 12.53246744632567,
464
  "learning_rate": 4.3224215685535287e-07,
465
+ "logits/chosen": -1.6234986782073975,
466
+ "logits/rejected": -1.4319360256195068,
467
+ "logps/chosen": -540.7374267578125,
468
+ "logps/rejected": -702.7229614257812,
469
+ "loss": 0.4621,
470
+ "rewards/accuracies": 0.7749999761581421,
471
+ "rewards/chosen": -2.5035765171051025,
472
+ "rewards/margins": 1.4901249408721924,
473
+ "rewards/rejected": -3.993701457977295,
474
  "step": 150
475
  },
476
  {
477
+ "epoch": 0.32719836400818,
478
+ "grad_norm": 15.04185556896974,
479
  "learning_rate": 4.2579393593117364e-07,
480
+ "logits/chosen": -1.533140778541565,
481
+ "logits/rejected": -1.3760929107666016,
482
+ "logps/chosen": -481.59954833984375,
483
+ "logps/rejected": -652.427490234375,
484
+ "loss": 0.4563,
485
+ "rewards/accuracies": 0.8187500238418579,
486
+ "rewards/chosen": -2.2860851287841797,
487
+ "rewards/margins": 1.5269566774368286,
488
+ "rewards/rejected": -3.8130416870117188,
489
  "step": 155
490
  },
491
  {
492
+ "epoch": 0.3377531499439277,
493
+ "grad_norm": 13.12290477739756,
494
  "learning_rate": 4.191056016360699e-07,
495
+ "logits/chosen": -1.6271283626556396,
496
+ "logits/rejected": -1.5437796115875244,
497
+ "logps/chosen": -519.0302734375,
498
+ "logps/rejected": -663.0972900390625,
499
+ "loss": 0.4474,
500
+ "rewards/accuracies": 0.7562500238418579,
501
+ "rewards/chosen": -2.57169771194458,
502
+ "rewards/margins": 1.3303985595703125,
503
+ "rewards/rejected": -3.9020965099334717,
504
  "step": 160
505
  },
506
  {
507
+ "epoch": 0.34830793587967546,
508
+ "grad_norm": 15.784620667841397,
509
  "learning_rate": 4.121862894301754e-07,
510
+ "logits/chosen": -1.5655263662338257,
511
+ "logits/rejected": -1.5315742492675781,
512
+ "logps/chosen": -580.234375,
513
+ "logps/rejected": -725.3365478515625,
514
+ "loss": 0.4634,
515
+ "rewards/accuracies": 0.7562500238418579,
516
+ "rewards/chosen": -2.9336700439453125,
517
+ "rewards/margins": 1.4053281545639038,
518
+ "rewards/rejected": -4.338997840881348,
519
  "step": 165
520
  },
521
  {
522
+ "epoch": 0.35886272181542317,
523
+ "grad_norm": 18.950750373982537,
524
  "learning_rate": 4.050454502616667e-07,
525
+ "logits/chosen": -1.7045695781707764,
526
+ "logits/rejected": -1.6517536640167236,
527
+ "logps/chosen": -554.7913208007812,
528
+ "logps/rejected": -716.6380615234375,
529
+ "loss": 0.4487,
530
+ "rewards/accuracies": 0.8062499761581421,
531
+ "rewards/chosen": -2.823065757751465,
532
+ "rewards/margins": 1.4535109996795654,
533
+ "rewards/rejected": -4.276576519012451,
534
  "step": 170
535
  },
536
  {
537
+ "epoch": 0.36941750775117094,
538
+ "grad_norm": 17.91495880964947,
539
  "learning_rate": 3.976928376579047e-07,
540
+ "logits/chosen": -1.6950843334197998,
541
+ "logits/rejected": -1.582262396812439,
542
+ "logps/chosen": -520.1227416992188,
543
+ "logps/rejected": -664.7024536132812,
544
+ "loss": 0.445,
545
+ "rewards/accuracies": 0.824999988079071,
546
+ "rewards/chosen": -2.5456645488739014,
547
+ "rewards/margins": 1.2352759838104248,
548
+ "rewards/rejected": -3.780940532684326,
549
  "step": 175
550
  },
551
  {
552
+ "epoch": 0.37997229368691865,
553
+ "grad_norm": 15.17521580280272,
554
  "learning_rate": 3.9013849440328945e-07,
555
+ "logits/chosen": -1.6940500736236572,
556
+ "logits/rejected": -1.5467922687530518,
557
+ "logps/chosen": -520.3582763671875,
558
+ "logps/rejected": -680.7635498046875,
559
+ "loss": 0.4665,
560
+ "rewards/accuracies": 0.8125,
561
+ "rewards/chosen": -2.551764726638794,
562
+ "rewards/margins": 1.372143268585205,
563
+ "rewards/rejected": -3.92390775680542,
564
  "step": 180
565
  },
566
  {
567
+ "epoch": 0.3905270796226664,
568
+ "grad_norm": 14.308326454629483,
569
  "learning_rate": 3.8239273882202473e-07,
570
+ "logits/chosen": -1.6619869470596313,
571
+ "logits/rejected": -1.5419933795928955,
572
+ "logps/chosen": -506.9136657714844,
573
+ "logps/rejected": -639.9857177734375,
574
+ "loss": 0.4574,
575
+ "rewards/accuracies": 0.737500011920929,
576
+ "rewards/chosen": -2.3956522941589355,
577
+ "rewards/margins": 1.23929762840271,
578
+ "rewards/rejected": -3.6349494457244873,
579
  "step": 185
580
  },
581
  {
582
+ "epoch": 0.40108186555841413,
583
+ "grad_norm": 15.767200038387838,
584
  "learning_rate": 3.7446615068452804e-07,
585
+ "logits/chosen": -1.6123485565185547,
586
+ "logits/rejected": -1.4534804821014404,
587
+ "logps/chosen": -510.5772399902344,
588
+ "logps/rejected": -681.8480224609375,
589
+ "loss": 0.4326,
590
  "rewards/accuracies": 0.831250011920929,
591
+ "rewards/chosen": -2.398505687713623,
592
+ "rewards/margins": 1.5248647928237915,
593
+ "rewards/rejected": -3.923370838165283,
594
  "step": 190
595
  },
596
  {
597
+ "epoch": 0.4116366514941619,
598
+ "grad_norm": 16.282324670439472,
599
  "learning_rate": 3.6636955675673743e-07,
600
+ "logits/chosen": -1.6264712810516357,
601
+ "logits/rejected": -1.4750279188156128,
602
+ "logps/chosen": -514.038330078125,
603
+ "logps/rejected": -647.3781127929688,
604
+ "loss": 0.4404,
605
+ "rewards/accuracies": 0.8062499761581421,
606
+ "rewards/chosen": -2.37794828414917,
607
+ "rewards/margins": 1.2105674743652344,
608
+ "rewards/rejected": -3.5885162353515625,
609
  "step": 195
610
  },
611
  {
612
+ "epoch": 0.4221914374299096,
613
+ "grad_norm": 21.312257244757074,
614
  "learning_rate": 3.5811401601205093e-07,
615
+ "logits/chosen": -1.5191190242767334,
616
+ "logits/rejected": -1.5369209051132202,
617
+ "logps/chosen": -559.904052734375,
618
+ "logps/rejected": -721.0767211914062,
619
+ "loss": 0.4493,
620
+ "rewards/accuracies": 0.8374999761581421,
621
+ "rewards/chosen": -2.823049545288086,
622
+ "rewards/margins": 1.5707520246505737,
623
+ "rewards/rejected": -4.393801689147949,
624
  "step": 200
625
  },
626
  {
627
+ "epoch": 0.4327462233656574,
628
+ "grad_norm": 18.951174802021,
629
  "learning_rate": 3.497108045260995e-07,
630
+ "logits/chosen": -1.6123138666152954,
631
+ "logits/rejected": -1.5209693908691406,
632
+ "logps/chosen": -508.78436279296875,
633
+ "logps/rejected": -680.7193603515625,
634
+ "loss": 0.4287,
635
+ "rewards/accuracies": 0.84375,
636
+ "rewards/chosen": -2.3978936672210693,
637
+ "rewards/margins": 1.566540002822876,
638
+ "rewards/rejected": -3.9644336700439453,
639
  "step": 205
640
  },
641
  {
642
+ "epoch": 0.4433010093014051,
643
+ "grad_norm": 28.219925216993275,
644
  "learning_rate": 3.411714000749838e-07,
645
+ "logits/chosen": -1.661116361618042,
646
+ "logits/rejected": -1.5592620372772217,
647
+ "logps/chosen": -547.822998046875,
648
+ "logps/rejected": -737.5689697265625,
649
+ "loss": 0.4248,
650
+ "rewards/accuracies": 0.84375,
651
+ "rewards/chosen": -2.826833724975586,
652
+ "rewards/margins": 1.727709412574768,
653
+ "rewards/rejected": -4.5545430183410645,
654
  "step": 210
655
  },
656
  {
657
+ "epoch": 0.45385579523715286,
658
+ "grad_norm": 27.445717175281757,
659
  "learning_rate": 3.3250746645801287e-07,
660
+ "logits/chosen": -1.6852436065673828,
661
+ "logits/rejected": -1.6252915859222412,
662
+ "logps/chosen": -603.220947265625,
663
+ "logps/rejected": -795.3065185546875,
664
+ "loss": 0.4409,
665
+ "rewards/accuracies": 0.800000011920929,
666
+ "rewards/chosen": -3.4767067432403564,
667
+ "rewards/margins": 1.7523845434188843,
668
+ "rewards/rejected": -5.229090690612793,
669
  "step": 215
670
  },
671
  {
672
+ "epoch": 0.46441058117290057,
673
+ "grad_norm": 16.007752631407985,
674
  "learning_rate": 3.237308375663571e-07,
675
+ "logits/chosen": -1.762291669845581,
676
+ "logits/rejected": -1.5295162200927734,
677
+ "logps/chosen": -531.2833862304688,
678
+ "logps/rejected": -725.7946166992188,
679
+ "loss": 0.3819,
680
+ "rewards/accuracies": 0.875,
681
+ "rewards/chosen": -2.7329602241516113,
682
+ "rewards/margins": 1.7636291980743408,
683
+ "rewards/rejected": -4.496589660644531,
684
  "step": 220
685
  },
686
  {
687
+ "epoch": 0.47496536710864834,
688
+ "grad_norm": 18.506003391111292,
689
  "learning_rate": 3.148535012193767e-07,
690
+ "logits/chosen": -1.754020094871521,
691
+ "logits/rejected": -1.6287786960601807,
692
+ "logps/chosen": -556.3197631835938,
693
+ "logps/rejected": -728.1083984375,
694
+ "loss": 0.3855,
695
+ "rewards/accuracies": 0.862500011920929,
696
+ "rewards/chosen": -2.739107370376587,
697
+ "rewards/margins": 1.6264499425888062,
698
+ "rewards/rejected": -4.3655571937561035,
699
  "step": 225
700
  },
701
  {
702
+ "epoch": 0.48552015304439605,
703
+ "grad_norm": 20.68896480009483,
704
  "learning_rate": 3.0588758279070183e-07,
705
+ "logits/chosen": -1.6177418231964111,
706
+ "logits/rejected": -1.5763094425201416,
707
+ "logps/chosen": -562.428955078125,
708
+ "logps/rejected": -753.3512573242188,
709
+ "loss": 0.4092,
710
  "rewards/accuracies": 0.824999988079071,
711
+ "rewards/chosen": -2.9177470207214355,
712
+ "rewards/margins": 1.7720897197723389,
713
+ "rewards/rejected": -4.6898369789123535,
714
  "step": 230
715
  },
716
  {
717
+ "epoch": 0.4960749389801438,
718
+ "grad_norm": 16.381736251716195,
719
  "learning_rate": 2.968453286464312e-07,
720
+ "logits/chosen": -1.4933600425720215,
721
+ "logits/rejected": -1.5784003734588623,
722
+ "logps/chosen": -504.9471740722656,
723
+ "logps/rejected": -648.5574951171875,
724
+ "loss": 0.4179,
725
+ "rewards/accuracies": 0.762499988079071,
726
+ "rewards/chosen": -2.378976345062256,
727
+ "rewards/margins": 1.3860208988189697,
728
+ "rewards/rejected": -3.7649970054626465,
729
  "step": 235
730
  },
731
  {
732
+ "epoch": 0.5066297249158915,
733
+ "grad_norm": 22.296205809275865,
734
  "learning_rate": 2.8773908941806877e-07,
735
+ "logits/chosen": -1.6254231929779053,
736
+ "logits/rejected": -1.621469259262085,
737
+ "logps/chosen": -599.214599609375,
738
+ "logps/rejected": -809.9522705078125,
739
+ "loss": 0.39,
740
+ "rewards/accuracies": 0.84375,
741
+ "rewards/chosen": -3.3130722045898438,
742
+ "rewards/margins": 1.939295768737793,
743
+ "rewards/rejected": -5.2523674964904785,
744
  "step": 240
745
  },
746
  {
747
+ "epoch": 0.5171845108516393,
748
+ "grad_norm": 17.974418608052964,
749
  "learning_rate": 2.785813031330473e-07,
750
+ "logits/chosen": -1.6836649179458618,
751
+ "logits/rejected": -1.6946824789047241,
752
+ "logps/chosen": -636.5640869140625,
753
+ "logps/rejected": -862.1500244140625,
754
+ "loss": 0.407,
755
+ "rewards/accuracies": 0.8187500238418579,
756
+ "rewards/chosen": -3.709088087081909,
757
+ "rewards/margins": 2.079369068145752,
758
+ "rewards/rejected": -5.788456916809082,
759
  "step": 245
760
  },
761
  {
762
+ "epoch": 0.5277392967873871,
763
+ "grad_norm": 21.869860477038394,
764
  "learning_rate": 2.693844782258779e-07,
765
+ "logits/chosen": -1.597246766090393,
766
+ "logits/rejected": -1.4808999300003052,
767
+ "logps/chosen": -554.4249877929688,
768
+ "logps/rejected": -735.31103515625,
769
+ "loss": 0.377,
770
+ "rewards/accuracies": 0.8374999761581421,
771
+ "rewards/chosen": -2.8226776123046875,
772
+ "rewards/margins": 1.673805832862854,
773
+ "rewards/rejected": -4.49648380279541,
774
  "step": 250
775
  },
776
  {
777
+ "epoch": 0.5382940827231347,
778
+ "grad_norm": 35.342954704070486,
779
  "learning_rate": 2.601611764531342e-07,
780
+ "logits/chosen": -1.599726676940918,
781
+ "logits/rejected": -1.5378262996673584,
782
+ "logps/chosen": -625.1596069335938,
783
+ "logps/rejected": -838.8173828125,
784
+ "loss": 0.3907,
785
+ "rewards/accuracies": 0.831250011920929,
786
+ "rewards/chosen": -3.5563435554504395,
787
+ "rewards/margins": 1.9609460830688477,
788
+ "rewards/rejected": -5.517289638519287,
789
  "step": 255
790
  },
791
  {
792
+ "epoch": 0.5488488686588825,
793
+ "grad_norm": 17.9297074159325,
794
  "learning_rate": 2.5092399573560323e-07,
795
+ "logits/chosen": -1.6411758661270142,
796
+ "logits/rejected": -1.6522302627563477,
797
+ "logps/chosen": -658.8689575195312,
798
+ "logps/rejected": -871.1637573242188,
799
+ "loss": 0.4168,
800
+ "rewards/accuracies": 0.8187500238418579,
801
+ "rewards/chosen": -3.866881847381592,
802
+ "rewards/margins": 2.113030195236206,
803
+ "rewards/rejected": -5.979912757873535,
804
  "step": 260
805
  },
806
  {
807
+ "epoch": 0.5594036545946303,
808
+ "grad_norm": 16.443040045807887,
809
  "learning_rate": 2.4168555295104124e-07,
810
+ "logits/chosen": -1.5852059125900269,
811
+ "logits/rejected": -1.5837304592132568,
812
+ "logps/chosen": -594.8711547851562,
813
+ "logps/rejected": -801.4606323242188,
814
+ "loss": 0.4093,
815
+ "rewards/accuracies": 0.8374999761581421,
816
+ "rewards/chosen": -3.3196072578430176,
817
+ "rewards/margins": 1.8402855396270752,
818
+ "rewards/rejected": -5.159893035888672,
819
  "step": 265
820
  },
821
  {
822
+ "epoch": 0.569958440530378,
823
+ "grad_norm": 15.035798283991243,
824
  "learning_rate": 2.3245846670103626e-07,
825
+ "logits/chosen": -1.5612332820892334,
826
+ "logits/rejected": -1.5125606060028076,
827
+ "logps/chosen": -579.2890014648438,
828
+ "logps/rejected": -773.1954956054688,
829
+ "loss": 0.3769,
830
+ "rewards/accuracies": 0.8374999761581421,
831
+ "rewards/chosen": -3.0682718753814697,
832
+ "rewards/margins": 1.7783126831054688,
833
+ "rewards/rejected": -4.846584320068359,
834
  "step": 270
835
  },
836
  {
837
+ "epoch": 0.5805132264661257,
838
+ "grad_norm": 20.761167206822886,
839
  "learning_rate": 2.232553400755159e-07,
840
+ "logits/chosen": -1.6236953735351562,
841
+ "logits/rejected": -1.5238358974456787,
842
+ "logps/chosen": -613.8088989257812,
843
+ "logps/rejected": -867.7283325195312,
844
+ "loss": 0.3697,
845
+ "rewards/accuracies": 0.8500000238418579,
846
+ "rewards/chosen": -3.327275037765503,
847
+ "rewards/margins": 2.372307538986206,
848
+ "rewards/rejected": -5.699582576751709,
849
  "step": 275
850
  },
851
  {
852
+ "epoch": 0.5910680124018735,
853
+ "grad_norm": 20.416122820074975,
854
  "learning_rate": 2.1408874343844294e-07,
855
+ "logits/chosen": -1.6903560161590576,
856
+ "logits/rejected": -1.5515328645706177,
857
+ "logps/chosen": -648.6663818359375,
858
+ "logps/rejected": -949.74609375,
859
+ "loss": 0.3701,
860
+ "rewards/accuracies": 0.8374999761581421,
861
+ "rewards/chosen": -3.67596435546875,
862
+ "rewards/margins": 2.625474214553833,
863
+ "rewards/rejected": -6.301438808441162,
864
  "step": 280
865
  },
866
  {
867
+ "epoch": 0.6016227983376212,
868
+ "grad_norm": 20.954462877817495,
869
  "learning_rate": 2.049711972582101e-07,
870
+ "logits/chosen": -1.7400896549224854,
871
+ "logits/rejected": -1.6318342685699463,
872
+ "logps/chosen": -647.7393798828125,
873
+ "logps/rejected": -899.9304809570312,
874
+ "loss": 0.3719,
875
+ "rewards/accuracies": 0.8125,
876
+ "rewards/chosen": -3.6654484272003174,
877
+ "rewards/margins": 2.371953010559082,
878
+ "rewards/rejected": -6.03740119934082,
879
  "step": 285
880
  },
881
  {
882
+ "epoch": 0.612177584273369,
883
+ "grad_norm": 17.14928736018049,
884
  "learning_rate": 1.9591515500618588e-07,
885
+ "logits/chosen": -1.6016016006469727,
886
+ "logits/rejected": -1.5183677673339844,
887
+ "logps/chosen": -670.8876342773438,
888
+ "logps/rejected": -865.2810668945312,
889
+ "loss": 0.4464,
890
+ "rewards/accuracies": 0.793749988079071,
891
+ "rewards/chosen": -3.901280641555786,
892
+ "rewards/margins": 1.737006425857544,
893
+ "rewards/rejected": -5.638287544250488,
894
  "step": 290
895
  },
896
  {
897
+ "epoch": 0.6227323702091166,
898
+ "grad_norm": 15.631817925073218,
899
  "learning_rate": 1.8693298614677112e-07,
900
+ "logits/chosen": -1.4730761051177979,
901
+ "logits/rejected": -1.3837854862213135,
902
+ "logps/chosen": -539.0037231445312,
903
+ "logps/rejected": -730.4061279296875,
904
+ "loss": 0.3841,
905
  "rewards/accuracies": 0.875,
906
+ "rewards/chosen": -2.58305025100708,
907
+ "rewards/margins": 1.7921463251113892,
908
+ "rewards/rejected": -4.37519645690918,
909
  "step": 295
910
  },
911
  {
912
+ "epoch": 0.6332871561448644,
913
+ "grad_norm": 17.780553626895177,
914
  "learning_rate": 1.7803695924219814e-07,
915
+ "logits/chosen": -1.4669979810714722,
916
+ "logits/rejected": -1.4254872798919678,
917
+ "logps/chosen": -584.629150390625,
918
+ "logps/rejected": -784.3005981445312,
919
+ "loss": 0.4051,
920
+ "rewards/accuracies": 0.831250011920929,
921
+ "rewards/chosen": -3.1204633712768555,
922
+ "rewards/margins": 1.9275195598602295,
923
+ "rewards/rejected": -5.047983169555664,
924
  "step": 300
925
  },
926
  {
927
+ "epoch": 0.6438419420806122,
928
+ "grad_norm": 19.866976638480853,
929
  "learning_rate": 1.6923922519515067e-07,
930
+ "logits/chosen": -1.4443576335906982,
931
+ "logits/rejected": -1.4355580806732178,
932
+ "logps/chosen": -523.0612182617188,
933
+ "logps/rejected": -729.1703491210938,
934
+ "loss": 0.4069,
935
+ "rewards/accuracies": 0.84375,
936
+ "rewards/chosen": -2.749462842941284,
937
+ "rewards/margins": 1.8047136068344116,
938
+ "rewards/rejected": -4.554176330566406,
939
  "step": 305
940
  },
941
  {
942
+ "epoch": 0.65439672801636,
943
+ "grad_norm": 19.42045257459303,
944
  "learning_rate": 1.605518006520924e-07,
945
+ "logits/chosen": -1.4894784688949585,
946
+ "logits/rejected": -1.4559067487716675,
947
+ "logps/chosen": -589.7485961914062,
948
+ "logps/rejected": -808.5824584960938,
949
+ "loss": 0.3816,
950
+ "rewards/accuracies": 0.824999988079071,
951
+ "rewards/chosen": -3.1640734672546387,
952
+ "rewards/margins": 2.0186519622802734,
953
+ "rewards/rejected": -5.182725429534912,
954
  "step": 310
955
  },
956
  {
957
+ "epoch": 0.6649515139521076,
958
+ "grad_norm": 34.470860837004935,
959
  "learning_rate": 1.519865515899731e-07,
960
+ "logits/chosen": -1.5242574214935303,
961
+ "logits/rejected": -1.3928359746932983,
962
+ "logps/chosen": -588.0692138671875,
963
+ "logps/rejected": -841.7396240234375,
964
+ "loss": 0.3858,
965
+ "rewards/accuracies": 0.8125,
966
+ "rewards/chosen": -3.3201167583465576,
967
+ "rewards/margins": 2.1101181507110596,
968
+ "rewards/rejected": -5.430234432220459,
969
  "step": 315
970
  },
971
  {
972
+ "epoch": 0.6755062998878554,
973
+ "grad_norm": 30.007742834102533,
974
  "learning_rate": 1.4355517710873182e-07,
975
+ "logits/chosen": -1.546661615371704,
976
+ "logits/rejected": -1.4665791988372803,
977
+ "logps/chosen": -627.7368774414062,
978
+ "logps/rejected": -869.39453125,
979
+ "loss": 0.3914,
980
+ "rewards/accuracies": 0.8500000238418579,
981
+ "rewards/chosen": -3.6285042762756348,
982
+ "rewards/margins": 2.2051749229431152,
983
+ "rewards/rejected": -5.83367919921875,
984
  "step": 320
985
  },
986
  {
987
+ "epoch": 0.6860610858236031,
988
+ "grad_norm": 34.25630759725382,
989
  "learning_rate": 1.3526919345173318e-07,
990
+ "logits/chosen": -1.5128138065338135,
991
+ "logits/rejected": -1.4411523342132568,
992
+ "logps/chosen": -602.6571044921875,
993
+ "logps/rejected": -849.2667846679688,
994
+ "loss": 0.4211,
995
  "rewards/accuracies": 0.84375,
996
+ "rewards/chosen": -3.323326826095581,
997
+ "rewards/margins": 2.333648681640625,
998
+ "rewards/rejected": -5.656975269317627,
999
  "step": 325
1000
  },
1001
  {
1002
+ "epoch": 0.6966158717593509,
1003
+ "grad_norm": 28.233731363069086,
1004
  "learning_rate": 1.2713991827596443e-07,
1005
+ "logits/chosen": -1.495444655418396,
1006
+ "logits/rejected": -1.4902544021606445,
1007
+ "logps/chosen": -605.8360595703125,
1008
+ "logps/rejected": -796.5411376953125,
1009
+ "loss": 0.3791,
1010
+ "rewards/accuracies": 0.8187500238418579,
1011
+ "rewards/chosen": -3.165205478668213,
1012
+ "rewards/margins": 1.914367914199829,
1013
+ "rewards/rejected": -5.079573631286621,
1014
  "step": 330
1015
  },
1016
  {
1017
+ "epoch": 0.7071706576950986,
1018
+ "grad_norm": 28.532009996830972,
1019
  "learning_rate": 1.191784551934773e-07,
1020
+ "logits/chosen": -1.5311321020126343,
1021
+ "logits/rejected": -1.444746971130371,
1022
+ "logps/chosen": -515.9117431640625,
1023
+ "logps/rejected": -714.9118041992188,
1024
+ "loss": 0.3985,
1025
+ "rewards/accuracies": 0.862500011920929,
1026
+ "rewards/chosen": -2.741084575653076,
1027
+ "rewards/margins": 1.8353731632232666,
1028
+ "rewards/rejected": -4.5764570236206055,
1029
  "step": 335
1030
  },
1031
  {
1032
+ "epoch": 0.7177254436308463,
1033
+ "grad_norm": 34.722496076575624,
1034
  "learning_rate": 1.1139567860518953e-07,
1035
+ "logits/chosen": -1.3724250793457031,
1036
+ "logits/rejected": -1.3445093631744385,
1037
+ "logps/chosen": -516.8666381835938,
1038
+ "logps/rejected": -690.0352172851562,
1039
+ "loss": 0.4182,
1040
+ "rewards/accuracies": 0.800000011920929,
1041
+ "rewards/chosen": -2.5948445796966553,
1042
+ "rewards/margins": 1.631908655166626,
1043
+ "rewards/rejected": -4.226753234863281,
1044
  "step": 340
1045
  },
1046
  {
1047
+ "epoch": 0.7282802295665941,
1048
+ "grad_norm": 29.539595863207374,
1049
  "learning_rate": 1.0380221884776128e-07,
1050
+ "logits/chosen": -1.4173920154571533,
1051
+ "logits/rejected": -1.3776549100875854,
1052
+ "logps/chosen": -542.7964477539062,
1053
+ "logps/rejected": -722.7305297851562,
1054
+ "loss": 0.4204,
1055
+ "rewards/accuracies": 0.856249988079071,
1056
+ "rewards/chosen": -2.7417469024658203,
1057
+ "rewards/margins": 1.6534277200698853,
1058
+ "rewards/rejected": -4.395174980163574,
1059
  "step": 345
1060
  },
1061
  {
1062
+ "epoch": 0.7388350155023419,
1063
+ "grad_norm": 17.47998143760053,
1064
  "learning_rate": 9.640844767383405e-08,
1065
+ "logits/chosen": -1.3427600860595703,
1066
+ "logits/rejected": -1.2479599714279175,
1067
+ "logps/chosen": -540.9366455078125,
1068
+ "logps/rejected": -741.2656860351562,
1069
+ "loss": 0.4206,
1070
+ "rewards/accuracies": 0.831250011920929,
1071
+ "rewards/chosen": -2.7238881587982178,
1072
+ "rewards/margins": 1.8479011058807373,
1073
+ "rewards/rejected": -4.571789741516113,
1074
  "step": 350
1075
  },
1076
  {
1077
+ "epoch": 0.7493898014380895,
1078
+ "grad_norm": 22.53665974937468,
1079
  "learning_rate": 8.922446408546378e-08,
1080
+ "logits/chosen": -1.3614610433578491,
1081
+ "logits/rejected": -1.3304545879364014,
1082
+ "logps/chosen": -564.0736694335938,
1083
+ "logps/rejected": -796.0197143554688,
1084
+ "loss": 0.4234,
1085
+ "rewards/accuracies": 0.8125,
1086
+ "rewards/chosen": -2.8989734649658203,
1087
+ "rewards/margins": 2.1152186393737793,
1088
+ "rewards/rejected": -5.0141921043396,
1089
  "step": 355
1090
  },
1091
  {
1092
+ "epoch": 0.7599445873738373,
1093
+ "grad_norm": 34.016623930856696,
1094
  "learning_rate": 8.22600805400994e-08,
1095
+ "logits/chosen": -1.4026951789855957,
1096
+ "logits/rejected": -1.3086416721343994,
1097
+ "logps/chosen": -557.1563720703125,
1098
+ "logps/rejected": -775.2821044921875,
1099
+ "loss": 0.3918,
1100
+ "rewards/accuracies": 0.8687499761581421,
1101
+ "rewards/chosen": -2.755949020385742,
1102
+ "rewards/margins": 1.9373924732208252,
1103
+ "rewards/rejected": -4.6933417320251465,
1104
  "step": 360
1105
  },
1106
  {
1107
+ "epoch": 0.7704993733095851,
1108
+ "grad_norm": 19.583973928513963,
1109
  "learning_rate": 7.552480954794558e-08,
1110
+ "logits/chosen": -1.4313266277313232,
1111
+ "logits/rejected": -1.3103513717651367,
1112
+ "logps/chosen": -564.1690673828125,
1113
+ "logps/rejected": -756.8692626953125,
1114
+ "loss": 0.4048,
1115
+ "rewards/accuracies": 0.7875000238418579,
1116
+ "rewards/chosen": -2.88138484954834,
1117
+ "rewards/margins": 1.8533366918563843,
1118
+ "rewards/rejected": -4.7347211837768555,
1119
  "step": 365
1120
  },
1121
  {
1122
+ "epoch": 0.7810541592453328,
1123
+ "grad_norm": 18.206536925706505,
1124
  "learning_rate": 6.902785067901854e-08,
1125
+ "logits/chosen": -1.3645613193511963,
1126
+ "logits/rejected": -1.3341350555419922,
1127
+ "logps/chosen": -579.579833984375,
1128
+ "logps/rejected": -779.4511108398438,
1129
+ "loss": 0.3792,
1130
+ "rewards/accuracies": 0.8187500238418579,
1131
+ "rewards/chosen": -3.0597000122070312,
1132
+ "rewards/margins": 1.8546708822250366,
1133
+ "rewards/rejected": -4.914370536804199,
1134
  "step": 370
1135
  },
1136
  {
1137
+ "epoch": 0.7916089451810805,
1138
+ "grad_norm": 25.99340540536939,
1139
  "learning_rate": 6.277807799763973e-08,
1140
+ "logits/chosen": -1.5334607362747192,
1141
+ "logits/rejected": -1.4499049186706543,
1142
+ "logps/chosen": -563.6980590820312,
1143
+ "logps/rejected": -789.72119140625,
1144
+ "loss": 0.3903,
1145
+ "rewards/accuracies": 0.8687499761581421,
1146
+ "rewards/chosen": -3.1820693016052246,
1147
+ "rewards/margins": 1.9633289575576782,
1148
+ "rewards/rejected": -5.145398139953613,
1149
  "step": 375
1150
  },
1151
  {
1152
+ "epoch": 0.8021637311168283,
1153
+ "grad_norm": 27.363349380647865,
1154
  "learning_rate": 5.678402794153145e-08,
1155
+ "logits/chosen": -1.490678071975708,
1156
+ "logits/rejected": -1.4118311405181885,
1157
+ "logps/chosen": -635.3906860351562,
1158
+ "logps/rejected": -841.986328125,
1159
+ "loss": 0.3978,
1160
+ "rewards/accuracies": 0.824999988079071,
1161
+ "rewards/chosen": -3.6473312377929688,
1162
+ "rewards/margins": 1.8795642852783203,
1163
+ "rewards/rejected": -5.526895523071289,
1164
  "step": 380
1165
  },
1166
  {
1167
+ "epoch": 0.812718517052576,
1168
+ "grad_norm": 22.024184023637442,
1169
  "learning_rate": 5.105388766206969e-08,
1170
+ "logits/chosen": -1.6372134685516357,
1171
+ "logits/rejected": -1.4456651210784912,
1172
+ "logps/chosen": -670.1497802734375,
1173
+ "logps/rejected": -900.1561279296875,
1174
+ "loss": 0.4107,
1175
+ "rewards/accuracies": 0.84375,
1176
+ "rewards/chosen": -3.8800835609436035,
1177
+ "rewards/margins": 2.1356186866760254,
1178
+ "rewards/rejected": -6.015702724456787,
1179
  "step": 385
1180
  },
1181
  {
1182
+ "epoch": 0.8232733029883238,
1183
+ "grad_norm": 18.957756630133876,
1184
  "learning_rate": 4.5595483841620484e-08,
1185
+ "logits/chosen": -1.4949450492858887,
1186
+ "logits/rejected": -1.466933250427246,
1187
+ "logps/chosen": -592.1412963867188,
1188
+ "logps/rejected": -814.1785888671875,
1189
+ "loss": 0.3793,
1190
+ "rewards/accuracies": 0.8374999761581421,
1191
+ "rewards/chosen": -3.3024775981903076,
1192
+ "rewards/margins": 1.9800523519515991,
1193
+ "rewards/rejected": -5.282530307769775,
1194
  "step": 390
1195
  },
1196
  {
1197
+ "epoch": 0.8338280889240715,
1198
+ "grad_norm": 24.130033467348618,
1199
  "learning_rate": 4.0416272003232526e-08,
1200
+ "logits/chosen": -1.5286778211593628,
1201
+ "logits/rejected": -1.4018694162368774,
1202
+ "logps/chosen": -614.2500610351562,
1203
+ "logps/rejected": -848.65673828125,
1204
+ "loss": 0.4233,
1205
  "rewards/accuracies": 0.856249988079071,
1206
+ "rewards/chosen": -3.3793911933898926,
1207
+ "rewards/margins": 2.2501933574676514,
1208
+ "rewards/rejected": -5.629584312438965,
1209
  "step": 395
1210
  },
1211
  {
1212
+ "epoch": 0.8443828748598192,
1213
+ "grad_norm": 21.924780448040742,
1214
  "learning_rate": 3.552332632729041e-08,
1215
+ "logits/chosen": -1.3315622806549072,
1216
+ "logits/rejected": -1.4189374446868896,
1217
+ "logps/chosen": -591.8238525390625,
1218
+ "logps/rejected": -745.0551147460938,
1219
+ "loss": 0.4269,
1220
+ "rewards/accuracies": 0.7749999761581421,
1221
+ "rewards/chosen": -3.1886703968048096,
1222
+ "rewards/margins": 1.4928715229034424,
1223
+ "rewards/rejected": -4.68154239654541,
1224
  "step": 400
1225
  },
1226
  {
1227
+ "epoch": 0.8443828748598192,
1228
+ "eval_logits/chosen": -1.3921160697937012,
1229
+ "eval_logits/rejected": -1.360321283340454,
1230
+ "eval_logps/chosen": -602.09814453125,
1231
+ "eval_logps/rejected": -797.6878051757812,
1232
+ "eval_loss": 0.38372838497161865,
1233
+ "eval_rewards/accuracies": 0.8643724918365479,
1234
+ "eval_rewards/chosen": -3.2510859966278076,
1235
+ "eval_rewards/margins": 1.8691294193267822,
1236
+ "eval_rewards/rejected": -5.120214939117432,
1237
+ "eval_runtime": 313.5439,
1238
+ "eval_samples_per_second": 6.302,
1239
+ "eval_steps_per_second": 1.576,
1240
  "step": 400
1241
  },
1242
  {
1243
+ "epoch": 0.854937660795567,
1244
+ "grad_norm": 19.316578134763,
1245
  "learning_rate": 3.092332998903416e-08,
1246
+ "logits/chosen": -1.4394410848617554,
1247
+ "logits/rejected": -1.3752410411834717,
1248
+ "logps/chosen": -601.6661987304688,
1249
+ "logps/rejected": -783.7593994140625,
1250
+ "loss": 0.3863,
1251
+ "rewards/accuracies": 0.8187500238418579,
1252
+ "rewards/chosen": -3.3254780769348145,
1253
+ "rewards/margins": 1.7059745788574219,
1254
+ "rewards/rejected": -5.031452655792236,
1255
  "step": 405
1256
  },
1257
  {
1258
+ "epoch": 0.8654924467313148,
1259
+ "grad_norm": 28.757209433347523,
1260
  "learning_rate": 2.6622566030146455e-08,
1261
+ "logits/chosen": -1.4320390224456787,
1262
+ "logits/rejected": -1.413570523262024,
1263
+ "logps/chosen": -565.2530517578125,
1264
+ "logps/rejected": -754.3922729492188,
1265
+ "loss": 0.4213,
1266
+ "rewards/accuracies": 0.8500000238418579,
1267
+ "rewards/chosen": -2.987544298171997,
1268
+ "rewards/margins": 1.7588831186294556,
1269
+ "rewards/rejected": -4.7464280128479,
1270
  "step": 410
1271
  },
1272
  {
1273
+ "epoch": 0.8760472326670625,
1274
+ "grad_norm": 18.43535305501894,
1275
  "learning_rate": 2.26269087768734e-08,
1276
+ "logits/chosen": -1.4583203792572021,
1277
+ "logits/rejected": -1.3761074542999268,
1278
+ "logps/chosen": -577.720703125,
1279
+ "logps/rejected": -806.8515014648438,
1280
+ "loss": 0.3829,
1281
+ "rewards/accuracies": 0.8500000238418579,
1282
+ "rewards/chosen": -3.2120673656463623,
1283
+ "rewards/margins": 2.084867477416992,
1284
+ "rewards/rejected": -5.296935081481934,
1285
  "step": 415
1286
  },
1287
  {
1288
+ "epoch": 0.8866020186028102,
1289
+ "grad_norm": 20.96371705639774,
1290
  "learning_rate": 1.894181581640106e-08,
1291
+ "logits/chosen": -1.384445071220398,
1292
+ "logits/rejected": -1.4241831302642822,
1293
+ "logps/chosen": -569.5962524414062,
1294
+ "logps/rejected": -749.0470581054688,
1295
+ "loss": 0.3819,
1296
+ "rewards/accuracies": 0.856249988079071,
1297
+ "rewards/chosen": -3.081413745880127,
1298
+ "rewards/margins": 1.7126514911651611,
1299
+ "rewards/rejected": -4.794064998626709,
1300
  "step": 420
1301
  },
1302
  {
1303
+ "epoch": 0.897156804538558,
1304
+ "grad_norm": 26.21197116684321,
1305
  "learning_rate": 1.5572320542448143e-08,
1306
+ "logits/chosen": -1.3703296184539795,
1307
+ "logits/rejected": -1.3327410221099854,
1308
+ "logps/chosen": -617.3011474609375,
1309
+ "logps/rejected": -836.2391357421875,
1310
+ "loss": 0.3825,
1311
+ "rewards/accuracies": 0.875,
1312
+ "rewards/chosen": -3.3691182136535645,
1313
+ "rewards/margins": 2.073474407196045,
1314
+ "rewards/rejected": -5.442592620849609,
1315
  "step": 425
1316
  },
1317
  {
1318
+ "epoch": 0.9077115904743057,
1319
+ "grad_norm": 22.75331582162892,
1320
  "learning_rate": 1.2523025280255729e-08,
1321
+ "logits/chosen": -1.4123306274414062,
1322
+ "logits/rejected": -1.4244548082351685,
1323
+ "logps/chosen": -593.9744262695312,
1324
+ "logps/rejected": -802.8662719726562,
1325
+ "loss": 0.3422,
1326
+ "rewards/accuracies": 0.8687499761581421,
1327
+ "rewards/chosen": -3.253312349319458,
1328
+ "rewards/margins": 1.9519180059432983,
1329
+ "rewards/rejected": -5.205230712890625,
1330
  "step": 430
1331
  },
1332
  {
1333
+ "epoch": 0.9182663764100535,
1334
+ "grad_norm": 17.924586145441417,
1335
  "learning_rate": 9.798095000364214e-09,
1336
+ "logits/chosen": -1.5641670227050781,
1337
+ "logits/rejected": -1.4084519147872925,
1338
+ "logps/chosen": -600.7659912109375,
1339
+ "logps/rejected": -869.5779418945312,
1340
+ "loss": 0.3553,
1341
+ "rewards/accuracies": 0.8812500238418579,
1342
+ "rewards/chosen": -3.2969956398010254,
1343
+ "rewards/margins": 2.371945381164551,
1344
+ "rewards/rejected": -5.668941020965576,
1345
  "step": 435
1346
  },
1347
  {
1348
+ "epoch": 0.9288211623458011,
1349
+ "grad_norm": 25.268001512531786,
1350
  "learning_rate": 7.401251629764876e-09,
1351
+ "logits/chosen": -1.5503554344177246,
1352
+ "logits/rejected": -1.3530725240707397,
1353
+ "logps/chosen": -560.983154296875,
1354
+ "logps/rejected": -772.5062255859375,
1355
+ "loss": 0.4149,
1356
+ "rewards/accuracies": 0.8374999761581421,
1357
+ "rewards/chosen": -3.1528074741363525,
1358
+ "rewards/margins": 1.8247524499893188,
1359
+ "rewards/rejected": -4.977559566497803,
1360
  "step": 440
1361
  },
1362
  {
1363
+ "epoch": 0.9393759482815489,
1364
+ "grad_norm": 18.848350951492684,
1365
  "learning_rate": 5.335768968195098e-09,
1366
+ "logits/chosen": -1.4744082689285278,
1367
+ "logits/rejected": -1.370416283607483,
1368
+ "logps/chosen": -589.2518920898438,
1369
+ "logps/rejected": -806.16943359375,
1370
+ "loss": 0.4069,
1371
+ "rewards/accuracies": 0.856249988079071,
1372
+ "rewards/chosen": -3.1618690490722656,
1373
+ "rewards/margins": 2.018306255340576,
1374
+ "rewards/rejected": -5.180174827575684,
1375
  "step": 445
1376
  },
1377
  {
1378
+ "epoch": 0.9499307342172967,
1379
+ "grad_norm": 14.44527353219914,
1380
  "learning_rate": 3.604468216521883e-09,
1381
+ "logits/chosen": -1.3630057573318481,
1382
+ "logits/rejected": -1.2414896488189697,
1383
+ "logps/chosen": -591.0068359375,
1384
+ "logps/rejected": -788.7886352539062,
1385
+ "loss": 0.371,
1386
+ "rewards/accuracies": 0.875,
1387
+ "rewards/chosen": -3.234240770339966,
1388
+ "rewards/margins": 1.8283990621566772,
1389
+ "rewards/rejected": -5.0626397132873535,
1390
  "step": 450
1391
  },
1392
  {
1393
+ "epoch": 0.9604855201530444,
1394
+ "grad_norm": 17.894278742200918,
1395
  "learning_rate": 2.2097141233206884e-09,
1396
+ "logits/chosen": -1.5407589673995972,
1397
+ "logits/rejected": -1.4432313442230225,
1398
+ "logps/chosen": -616.9603271484375,
1399
+ "logps/rejected": -858.2979736328125,
1400
+ "loss": 0.3764,
1401
+ "rewards/accuracies": 0.856249988079071,
1402
+ "rewards/chosen": -3.5265698432922363,
1403
+ "rewards/margins": 2.2084403038024902,
1404
+ "rewards/rejected": -5.735010623931885,
1405
  "step": 455
1406
  },
1407
  {
1408
+ "epoch": 0.9710403060887921,
1409
+ "grad_norm": 18.751457793725724,
1410
  "learning_rate": 1.1534117549133472e-09,
1411
+ "logits/chosen": -1.46907639503479,
1412
+ "logits/rejected": -1.3426740169525146,
1413
+ "logps/chosen": -629.7129516601562,
1414
+ "logps/rejected": -850.1477661132812,
1415
+ "loss": 0.3512,
1416
+ "rewards/accuracies": 0.856249988079071,
1417
+ "rewards/chosen": -3.58038330078125,
1418
+ "rewards/margins": 2.005390167236328,
1419
+ "rewards/rejected": -5.585773468017578,
1420
  "step": 460
1421
  },
1422
  {
1423
+ "epoch": 0.9815950920245399,
1424
+ "grad_norm": 29.39806840137654,
1425
  "learning_rate": 4.3700389327672173e-10,
1426
+ "logits/chosen": -1.469868779182434,
1427
+ "logits/rejected": -1.3952000141143799,
1428
+ "logps/chosen": -604.9054565429688,
1429
+ "logps/rejected": -832.4953002929688,
1430
+ "loss": 0.3768,
1431
+ "rewards/accuracies": 0.8687499761581421,
1432
+ "rewards/chosen": -3.387165069580078,
1433
+ "rewards/margins": 2.0868310928344727,
1434
+ "rewards/rejected": -5.473996162414551,
1435
  "step": 465
1436
  },
1437
  {
1438
+ "epoch": 0.9921498779602876,
1439
+ "grad_norm": 22.359148428267968,
1440
  "learning_rate": 6.146906537587982e-11,
1441
+ "logits/chosen": -1.4432518482208252,
1442
+ "logits/rejected": -1.4169013500213623,
1443
+ "logps/chosen": -600.5186157226562,
1444
+ "logps/rejected": -811.341796875,
1445
+ "loss": 0.3939,
1446
+ "rewards/accuracies": 0.8062499761581421,
1447
+ "rewards/chosen": -3.3816261291503906,
1448
+ "rewards/margins": 2.0300629138946533,
1449
+ "rewards/rejected": -5.411688804626465,
1450
  "step": 470
1451
  },
1452
  {
1453
+ "epoch": 0.9984827495217362,
1454
  "step": 473,
1455
  "total_flos": 0.0,
1456
+ "train_loss": 0.4645486564767285,
1457
+ "train_runtime": 24049.7915,
1458
+ "train_samples_per_second": 2.521,
1459
+ "train_steps_per_second": 0.02
1460
  }
1461
  ],
1462
  "logging_steps": 5,
 
1477
  }
1478
  },
1479
  "total_flos": 0.0,
1480
+ "train_batch_size": 1,
1481
  "trial_name": null,
1482
  "trial_params": null
1483
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c1f5e554cceb99bd6f337e8a5aed371111e7e9f9e7e6a430c61cfbd978575d48
3
  size 7544
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d1448ab9c2dd3a69fc51142481a50485c35957226f08c9032026a3fb5687e76
3
  size 7544
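
The bulk of this commit is the regenerated Trainer state shown above: the `log_history` entries with `loss`, `logps/*`, `rewards/chosen`, `rewards/rejected`, `rewards/margins`, and the step-400 `eval_*` metrics. As a quick way to inspect those numbers after pulling the repo, here is a minimal sketch; it assumes the file in the diff is the standard `trainer_state.json` written by the Hugging Face Trainer (the filename is not visible in this commit view) and uses only keys that appear above.

```python
import json

# Minimal sketch (assumption: the diff above is the standard trainer_state.json
# written by the Hugging Face Trainer; the filename is not shown in this view).
with open("trainer_state.json") as f:
    state = json.load(f)

# Walk the log_history entries shown in the diff and print the reward margin at
# each logged step. Training entries carry "rewards/margins"; the step-400
# evaluation entry carries "eval_rewards/margins"; the final summary entry has
# neither and is skipped.
for entry in state["log_history"]:
    margin = entry.get("rewards/margins", entry.get("eval_rewards/margins"))
    loss = entry.get("loss", entry.get("eval_loss"))
    if margin is None or loss is None:
        continue
    print(f'step {entry["step"]:>4}  loss {loss:.4f}  margin {margin:.3f}')
```

Run from the repository root, this prints one line per logged step (every 5 steps, per `logging_steps`), making it easy to confirm the trend visible in the diff: the margin grows from roughly 0 at step 5 to about 2 by the end of the epoch.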