li-muyang committed on
Commit
e6d0960
·
verified ·
1 Parent(s): 4c946aa

Model save

Browse files
Files changed (4) hide show
  1. README.md +10 -10
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +487 -487
README.md CHANGED
@@ -16,15 +16,15 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.5260
20
- - Rewards/chosen: -0.6381
21
- - Rewards/rejected: -1.4215
22
- - Rewards/accuracies: 0.7773
23
- - Rewards/margins: 0.7834
24
- - Logps/rejected: -409.2955
25
- - Logps/chosen: -334.1724
26
- - Logits/rejected: -0.8835
27
- - Logits/chosen: -1.0303
28
 
29
  ## Model description
30
 
@@ -61,7 +61,7 @@ The following hyperparameters were used during training:
61
 
62
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
- | 0.5174 | 0.9984 | 477 | 0.5260 | -0.6381 | -1.4215 | 0.7773 | 0.7834 | -409.2955 | -334.1724 | -0.8835 | -1.0303 |
65
 
66
 
67
  ### Framework versions
 
16
 
17
  This model was trained from scratch on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
+ - Loss: 0.5279
20
+ - Rewards/chosen: -0.6819
21
+ - Rewards/rejected: -1.4900
22
+ - Rewards/accuracies: 0.7812
23
+ - Rewards/margins: 0.8081
24
+ - Logps/rejected: -425.1121
25
+ - Logps/chosen: -348.1232
26
+ - Logits/rejected: -1.3790
27
+ - Logits/chosen: -1.4815
28
 
29
  ## Model description
30
 
 
61
 
62
  | Training Loss | Epoch | Step | Validation Loss | Rewards/chosen | Rewards/rejected | Rewards/accuracies | Rewards/margins | Logps/rejected | Logps/chosen | Logits/rejected | Logits/chosen |
63
  |:-------------:|:------:|:----:|:---------------:|:--------------:|:----------------:|:------------------:|:---------------:|:--------------:|:------------:|:---------------:|:-------------:|
64
+ | 0.5147 | 0.9984 | 477 | 0.5279 | -0.6819 | -1.4900 | 0.7812 | 0.8081 | -425.1121 | -348.1232 | -1.3790 | -1.4815 |
65
 
66
 
67
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5633771029658288,
5
- "train_runtime": 15455.4899,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 3.955,
8
- "train_steps_per_second": 0.031
9
  }
 
1
  {
2
  "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5669088098737929,
5
+ "train_runtime": 15771.2037,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 3.876,
8
+ "train_steps_per_second": 0.03
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
- "train_loss": 0.5633771029658288,
5
- "train_runtime": 15455.4899,
6
  "train_samples": 61134,
7
- "train_samples_per_second": 3.955,
8
- "train_steps_per_second": 0.031
9
  }
 
1
  {
2
  "epoch": 0.9984301412872841,
3
  "total_flos": 0.0,
4
+ "train_loss": 0.5669088098737929,
5
+ "train_runtime": 15771.2037,
6
  "train_samples": 61134,
7
+ "train_samples_per_second": 3.876,
8
+ "train_steps_per_second": 0.03
9
  }
trainer_state.json CHANGED
@@ -10,12 +10,12 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
- "grad_norm": 11.915830605029264,
14
  "learning_rate": 1.0416666666666666e-08,
15
- "logits/chosen": -2.900132894515991,
16
- "logits/rejected": -2.834955930709839,
17
- "logps/chosen": -317.546875,
18
- "logps/rejected": -362.03985595703125,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
@@ -25,733 +25,733 @@
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
- "grad_norm": 9.037023118445425,
29
  "learning_rate": 1.0416666666666667e-07,
30
- "logits/chosen": -2.636439800262451,
31
- "logits/rejected": -2.5899064540863037,
32
- "logps/chosen": -314.6423645019531,
33
- "logps/rejected": -281.7502136230469,
34
- "loss": 0.6931,
35
- "rewards/accuracies": 0.5,
36
- "rewards/chosen": 0.0006509354570880532,
37
- "rewards/margins": 0.00030382387922145426,
38
- "rewards/rejected": 0.0003471115487627685,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
- "grad_norm": 8.350785008747597,
44
  "learning_rate": 2.0833333333333333e-07,
45
- "logits/chosen": -2.7466042041778564,
46
- "logits/rejected": -2.6606907844543457,
47
- "logps/chosen": -315.2346496582031,
48
- "logps/rejected": -285.74896240234375,
49
- "loss": 0.6927,
50
- "rewards/accuracies": 0.550000011920929,
51
- "rewards/chosen": 0.0006443248712457716,
52
- "rewards/margins": 0.00116971624083817,
53
- "rewards/rejected": -0.0005253913695923984,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
- "grad_norm": 7.732991459064897,
59
  "learning_rate": 3.1249999999999997e-07,
60
- "logits/chosen": -2.71269154548645,
61
- "logits/rejected": -2.614644765853882,
62
- "logps/chosen": -297.6011657714844,
63
- "logps/rejected": -253.3177947998047,
64
- "loss": 0.6908,
65
  "rewards/accuracies": 0.625,
66
- "rewards/chosen": 0.005987043492496014,
67
- "rewards/margins": 0.005661585368216038,
68
- "rewards/rejected": 0.0003254577750340104,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
- "grad_norm": 7.653673914009054,
74
  "learning_rate": 4.1666666666666667e-07,
75
- "logits/chosen": -2.600621461868286,
76
- "logits/rejected": -2.5460174083709717,
77
- "logps/chosen": -279.81024169921875,
78
- "logps/rejected": -266.014404296875,
79
- "loss": 0.685,
80
- "rewards/accuracies": 0.6875,
81
- "rewards/chosen": 0.00993821956217289,
82
- "rewards/margins": 0.01584107242524624,
83
- "rewards/rejected": -0.005902853794395924,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
- "grad_norm": 8.353830361973774,
89
  "learning_rate": 4.999731868769026e-07,
90
- "logits/chosen": -2.632459878921509,
91
- "logits/rejected": -2.5336594581604004,
92
- "logps/chosen": -282.0531005859375,
93
- "logps/rejected": -278.79656982421875,
94
- "loss": 0.6753,
95
- "rewards/accuracies": 0.71875,
96
- "rewards/chosen": 0.01642102375626564,
97
- "rewards/margins": 0.034309130162000656,
98
- "rewards/rejected": -0.017888108268380165,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
- "grad_norm": 9.167056957625269,
104
  "learning_rate": 4.990353313429303e-07,
105
- "logits/chosen": -2.657794237136841,
106
- "logits/rejected": -2.5765810012817383,
107
- "logps/chosen": -259.9486389160156,
108
- "logps/rejected": -256.295166015625,
109
- "loss": 0.663,
110
- "rewards/accuracies": 0.731249988079071,
111
- "rewards/chosen": 0.02387804351747036,
112
- "rewards/margins": 0.08073899894952774,
113
- "rewards/rejected": -0.056860946118831635,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
- "grad_norm": 8.724078011965831,
119
  "learning_rate": 4.967625656594781e-07,
120
- "logits/chosen": -2.545210599899292,
121
- "logits/rejected": -2.499584913253784,
122
- "logps/chosen": -307.4738464355469,
123
- "logps/rejected": -303.07073974609375,
124
- "loss": 0.6418,
125
- "rewards/accuracies": 0.6937500238418579,
126
- "rewards/chosen": -0.054794955998659134,
127
- "rewards/margins": 0.11737842857837677,
128
- "rewards/rejected": -0.1721733808517456,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
- "grad_norm": 11.143931467386217,
134
  "learning_rate": 4.93167072587771e-07,
135
- "logits/chosen": -2.6561365127563477,
136
- "logits/rejected": -2.5057692527770996,
137
- "logps/chosen": -350.4751892089844,
138
- "logps/rejected": -290.5388488769531,
139
- "loss": 0.6268,
140
- "rewards/accuracies": 0.7437499761581421,
141
- "rewards/chosen": -0.09161636233329773,
142
- "rewards/margins": 0.19298198819160461,
143
- "rewards/rejected": -0.28459829092025757,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
- "grad_norm": 12.494281610138923,
149
  "learning_rate": 4.882681251368548e-07,
150
- "logits/chosen": -2.5815837383270264,
151
- "logits/rejected": -2.5117011070251465,
152
- "logps/chosen": -282.310791015625,
153
- "logps/rejected": -313.5682067871094,
154
- "loss": 0.6067,
155
- "rewards/accuracies": 0.675000011920929,
156
- "rewards/chosen": -0.22465059161186218,
157
- "rewards/margins": 0.21727819740772247,
158
- "rewards/rejected": -0.44192880392074585,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
- "grad_norm": 11.897397709020806,
164
  "learning_rate": 4.820919832540181e-07,
165
- "logits/chosen": -2.59562611579895,
166
- "logits/rejected": -2.5282070636749268,
167
- "logps/chosen": -338.21673583984375,
168
- "logps/rejected": -348.1116027832031,
169
- "loss": 0.6072,
170
- "rewards/accuracies": 0.71875,
171
- "rewards/chosen": -0.31439146399497986,
172
- "rewards/margins": 0.3437823951244354,
173
- "rewards/rejected": -0.6581738591194153,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
- "grad_norm": 16.76442489393267,
179
  "learning_rate": 4.7467175306295647e-07,
180
- "logits/chosen": -2.593721866607666,
181
- "logits/rejected": -2.5085806846618652,
182
- "logps/chosen": -343.6995849609375,
183
- "logps/rejected": -360.6001892089844,
184
- "loss": 0.6009,
185
- "rewards/accuracies": 0.637499988079071,
186
- "rewards/chosen": -0.46031326055526733,
187
- "rewards/margins": 0.33665376901626587,
188
- "rewards/rejected": -0.796967089176178,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
- "grad_norm": 12.98951659559394,
194
  "learning_rate": 4.6604720940421207e-07,
195
- "logits/chosen": -2.31950044631958,
196
- "logits/rejected": -2.2902190685272217,
197
- "logps/chosen": -332.3330993652344,
198
- "logps/rejected": -360.6295166015625,
199
- "loss": 0.579,
200
- "rewards/accuracies": 0.7562500238418579,
201
- "rewards/chosen": -0.5197553634643555,
202
- "rewards/margins": 0.4039463996887207,
203
- "rewards/rejected": -0.9237018823623657,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
- "grad_norm": 13.545649928916575,
209
  "learning_rate": 4.5626458262912735e-07,
210
- "logits/chosen": -2.247131824493408,
211
- "logits/rejected": -2.171808958053589,
212
- "logps/chosen": -331.72137451171875,
213
- "logps/rejected": -355.0787658691406,
214
- "loss": 0.5595,
215
- "rewards/accuracies": 0.7250000238418579,
216
- "rewards/chosen": -0.3730407953262329,
217
- "rewards/margins": 0.37541478872299194,
218
- "rewards/rejected": -0.7484556436538696,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
- "grad_norm": 20.162985308235303,
224
  "learning_rate": 4.453763107901675e-07,
225
- "logits/chosen": -2.0727458000183105,
226
- "logits/rejected": -1.917345404624939,
227
- "logps/chosen": -367.02081298828125,
228
- "logps/rejected": -371.6455993652344,
229
- "loss": 0.5624,
230
  "rewards/accuracies": 0.7562500238418579,
231
- "rewards/chosen": -0.41769176721572876,
232
- "rewards/margins": 0.5499740839004517,
233
- "rewards/rejected": -0.9676656723022461,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
- "grad_norm": 19.0565927001832,
239
  "learning_rate": 4.3344075855595097e-07,
240
- "logits/chosen": -1.912172555923462,
241
- "logits/rejected": -1.765091896057129,
242
- "logps/chosen": -339.38177490234375,
243
- "logps/rejected": -350.7415771484375,
244
- "loss": 0.5677,
245
- "rewards/accuracies": 0.675000011920929,
246
- "rewards/chosen": -0.49130716919898987,
247
- "rewards/margins": 0.4586152136325836,
248
- "rewards/rejected": -0.9499223828315735,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
- "grad_norm": 15.74857851317074,
254
  "learning_rate": 4.2052190435769554e-07,
255
- "logits/chosen": -1.90102219581604,
256
- "logits/rejected": -1.661292314529419,
257
- "logps/chosen": -325.9547424316406,
258
- "logps/rejected": -354.97247314453125,
259
- "loss": 0.5665,
260
- "rewards/accuracies": 0.71875,
261
- "rewards/chosen": -0.4339516758918762,
262
- "rewards/margins": 0.566411018371582,
263
- "rewards/rejected": -1.0003626346588135,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
- "grad_norm": 22.11997978366815,
269
  "learning_rate": 4.0668899744407567e-07,
270
- "logits/chosen": -1.5486009120941162,
271
- "logits/rejected": -1.4190781116485596,
272
- "logps/chosen": -326.2482604980469,
273
- "logps/rejected": -345.6896057128906,
274
- "loss": 0.5715,
275
- "rewards/accuracies": 0.6812499761581421,
276
- "rewards/chosen": -0.5852349996566772,
277
- "rewards/margins": 0.4395717680454254,
278
- "rewards/rejected": -1.0248068571090698,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
- "grad_norm": 19.615157043086096,
284
  "learning_rate": 3.920161866827889e-07,
285
- "logits/chosen": -1.4963265657424927,
286
- "logits/rejected": -1.352418303489685,
287
- "logps/chosen": -322.8550720214844,
288
- "logps/rejected": -341.64959716796875,
289
- "loss": 0.535,
290
- "rewards/accuracies": 0.7437499761581421,
291
- "rewards/chosen": -0.45726776123046875,
292
- "rewards/margins": 0.46411681175231934,
293
- "rewards/rejected": -0.9213846325874329,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
- "grad_norm": 22.15693930626595,
299
  "learning_rate": 3.765821230985757e-07,
300
- "logits/chosen": -1.2935736179351807,
301
- "logits/rejected": -1.2584232091903687,
302
- "logps/chosen": -317.90557861328125,
303
- "logps/rejected": -357.7715759277344,
304
- "loss": 0.5501,
305
- "rewards/accuracies": 0.668749988079071,
306
- "rewards/chosen": -0.546875536441803,
307
- "rewards/margins": 0.43576058745384216,
308
- "rewards/rejected": -0.9826361536979675,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
- "grad_norm": 28.687486673954695,
314
  "learning_rate": 3.604695382782159e-07,
315
- "logits/chosen": -1.3054436445236206,
316
- "logits/rejected": -1.2638180255889893,
317
- "logps/chosen": -305.0118103027344,
318
- "logps/rejected": -359.88104248046875,
319
- "loss": 0.5721,
320
- "rewards/accuracies": 0.7124999761581421,
321
- "rewards/chosen": -0.4132348895072937,
322
- "rewards/margins": 0.4162468910217285,
323
- "rewards/rejected": -0.8294817209243774,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
- "grad_norm": 18.79057150911231,
329
  "learning_rate": 3.4376480090239047e-07,
330
- "logits/chosen": -1.5015009641647339,
331
- "logits/rejected": -1.174789547920227,
332
- "logps/chosen": -379.56378173828125,
333
- "logps/rejected": -367.51824951171875,
334
- "loss": 0.5705,
335
- "rewards/accuracies": 0.7124999761581421,
336
- "rewards/chosen": -0.591296374797821,
337
- "rewards/margins": 0.5415098667144775,
338
- "rewards/rejected": -1.1328063011169434,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
- "grad_norm": 18.737878732800926,
344
  "learning_rate": 3.265574537815398e-07,
345
- "logits/chosen": -1.2811614274978638,
346
- "logits/rejected": -1.1760584115982056,
347
- "logps/chosen": -330.80340576171875,
348
- "logps/rejected": -373.9013366699219,
349
- "loss": 0.5552,
350
- "rewards/accuracies": 0.699999988079071,
351
- "rewards/chosen": -0.7049997448921204,
352
- "rewards/margins": 0.5080182552337646,
353
- "rewards/rejected": -1.2130179405212402,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
- "grad_norm": 21.237231658325168,
359
  "learning_rate": 3.0893973387735683e-07,
360
- "logits/chosen": -1.5031620264053345,
361
- "logits/rejected": -1.2624866962432861,
362
- "logps/chosen": -332.4327697753906,
363
- "logps/rejected": -387.6866149902344,
364
- "loss": 0.5585,
365
- "rewards/accuracies": 0.75,
366
- "rewards/chosen": -0.6327264904975891,
367
- "rewards/margins": 0.7525407075881958,
368
- "rewards/rejected": -1.3852671384811401,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
- "grad_norm": 22.484006804145768,
374
  "learning_rate": 2.910060778827554e-07,
375
- "logits/chosen": -1.5307586193084717,
376
- "logits/rejected": -1.3196094036102295,
377
- "logps/chosen": -347.9156799316406,
378
- "logps/rejected": -378.3223571777344,
379
- "loss": 0.5217,
380
- "rewards/accuracies": 0.71875,
381
- "rewards/chosen": -0.5113299489021301,
382
- "rewards/margins": 0.5874797105789185,
383
- "rewards/rejected": -1.0988094806671143,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
- "grad_norm": 21.947882128729784,
389
  "learning_rate": 2.7285261601056697e-07,
390
- "logits/chosen": -1.339247465133667,
391
- "logits/rejected": -1.0327800512313843,
392
- "logps/chosen": -349.29168701171875,
393
- "logps/rejected": -384.4781494140625,
394
- "loss": 0.5379,
395
- "rewards/accuracies": 0.7749999761581421,
396
- "rewards/chosen": -0.6346049308776855,
397
- "rewards/margins": 0.722966194152832,
398
- "rewards/rejected": -1.357571005821228,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
- "grad_norm": 26.05656063347704,
404
  "learning_rate": 2.5457665670441937e-07,
405
- "logits/chosen": -1.2346374988555908,
406
- "logits/rejected": -1.1367831230163574,
407
- "logps/chosen": -348.9411315917969,
408
- "logps/rejected": -383.9248046875,
409
- "loss": 0.5433,
410
- "rewards/accuracies": 0.675000011920929,
411
- "rewards/chosen": -0.7850319147109985,
412
- "rewards/margins": 0.5753322839736938,
413
- "rewards/rejected": -1.3603640794754028,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
- "grad_norm": 16.33289220353537,
419
  "learning_rate": 2.3627616503391812e-07,
420
- "logits/chosen": -1.3926626443862915,
421
- "logits/rejected": -1.2385615110397339,
422
- "logps/chosen": -355.91156005859375,
423
- "logps/rejected": -402.8101501464844,
424
- "loss": 0.538,
425
- "rewards/accuracies": 0.75,
426
- "rewards/chosen": -0.5515707731246948,
427
- "rewards/margins": 0.7438338994979858,
428
- "rewards/rejected": -1.2954046726226807,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
- "grad_norm": 21.47375551562495,
434
  "learning_rate": 2.1804923757009882e-07,
435
- "logits/chosen": -1.2104097604751587,
436
- "logits/rejected": -1.0112183094024658,
437
- "logps/chosen": -337.76531982421875,
438
- "logps/rejected": -353.09912109375,
439
- "loss": 0.5436,
440
- "rewards/accuracies": 0.6937500238418579,
441
- "rewards/chosen": -0.6104549169540405,
442
- "rewards/margins": 0.5441664457321167,
443
- "rewards/rejected": -1.1546214818954468,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
- "grad_norm": 20.910890345984367,
449
  "learning_rate": 1.9999357655598891e-07,
450
- "logits/chosen": -1.2669506072998047,
451
- "logits/rejected": -1.1004546880722046,
452
- "logps/chosen": -335.47161865234375,
453
- "logps/rejected": -385.6603698730469,
454
- "loss": 0.5427,
455
- "rewards/accuracies": 0.75,
456
- "rewards/chosen": -0.8220598101615906,
457
- "rewards/margins": 0.5405200719833374,
458
- "rewards/rejected": -1.3625797033309937,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
- "grad_norm": 18.856326831181125,
464
  "learning_rate": 1.8220596619089573e-07,
465
- "logits/chosen": -1.356403112411499,
466
- "logits/rejected": -1.1351536512374878,
467
- "logps/chosen": -395.8989562988281,
468
- "logps/rejected": -410.227294921875,
469
- "loss": 0.5164,
470
- "rewards/accuracies": 0.7250000238418579,
471
- "rewards/chosen": -0.6085205078125,
472
- "rewards/margins": 0.6207507848739624,
473
- "rewards/rejected": -1.2292712926864624,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
- "grad_norm": 22.76686226367157,
479
  "learning_rate": 1.647817538357072e-07,
480
- "logits/chosen": -1.3988535404205322,
481
- "logits/rejected": -1.2010142803192139,
482
- "logps/chosen": -364.18408203125,
483
- "logps/rejected": -386.00714111328125,
484
- "loss": 0.5228,
485
- "rewards/accuracies": 0.8187500238418579,
486
- "rewards/chosen": -0.5230848789215088,
487
- "rewards/margins": 0.784976601600647,
488
- "rewards/rejected": -1.3080614805221558,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
- "grad_norm": 18.988874912843485,
494
  "learning_rate": 1.478143389201113e-07,
495
- "logits/chosen": -1.226210117340088,
496
- "logits/rejected": -0.9954258799552917,
497
- "logps/chosen": -328.40216064453125,
498
- "logps/rejected": -369.4967041015625,
499
- "loss": 0.5151,
500
- "rewards/accuracies": 0.8062499761581421,
501
- "rewards/chosen": -0.63496994972229,
502
- "rewards/margins": 0.7474745512008667,
503
- "rewards/rejected": -1.3824446201324463,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
- "grad_norm": 20.540627681342297,
509
  "learning_rate": 1.3139467229135998e-07,
510
- "logits/chosen": -1.3592547178268433,
511
- "logits/rejected": -1.3184325695037842,
512
- "logps/chosen": -331.37701416015625,
513
- "logps/rejected": -393.223876953125,
514
- "loss": 0.5226,
515
- "rewards/accuracies": 0.78125,
516
- "rewards/chosen": -0.5516895055770874,
517
- "rewards/margins": 0.6636101007461548,
518
- "rewards/rejected": -1.2152996063232422,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
- "grad_norm": 25.42987653920307,
524
  "learning_rate": 1.1561076868822755e-07,
525
- "logits/chosen": -1.2324409484863281,
526
- "logits/rejected": -1.0954737663269043,
527
- "logps/chosen": -362.8447265625,
528
- "logps/rejected": -403.8171081542969,
529
- "loss": 0.5213,
530
- "rewards/accuracies": 0.737500011920929,
531
- "rewards/chosen": -0.6163730621337891,
532
- "rewards/margins": 0.6569727659225464,
533
- "rewards/rejected": -1.273345708847046,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
- "grad_norm": 20.6687240224169,
539
  "learning_rate": 1.0054723495346482e-07,
540
- "logits/chosen": -1.2725862264633179,
541
- "logits/rejected": -1.0163639783859253,
542
- "logps/chosen": -324.8491516113281,
543
- "logps/rejected": -372.7526550292969,
544
- "loss": 0.4955,
545
- "rewards/accuracies": 0.800000011920929,
546
- "rewards/chosen": -0.5732508897781372,
547
- "rewards/margins": 0.7606478929519653,
548
- "rewards/rejected": -1.333898663520813,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
- "grad_norm": 19.600987167350965,
554
  "learning_rate": 8.628481651367875e-08,
555
- "logits/chosen": -1.1491715908050537,
556
- "logits/rejected": -0.8824012875556946,
557
- "logps/chosen": -378.06512451171875,
558
- "logps/rejected": -407.40948486328125,
559
- "loss": 0.5428,
560
- "rewards/accuracies": 0.737500011920929,
561
- "rewards/chosen": -0.6164501309394836,
562
- "rewards/margins": 0.7409273386001587,
563
- "rewards/rejected": -1.357377529144287,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
- "grad_norm": 21.557147229171978,
569
  "learning_rate": 7.289996455765748e-08,
570
- "logits/chosen": -1.1593047380447388,
571
- "logits/rejected": -0.9495538473129272,
572
- "logps/chosen": -332.91485595703125,
573
- "logps/rejected": -369.48150634765625,
574
- "loss": 0.5286,
575
- "rewards/accuracies": 0.78125,
576
- "rewards/chosen": -0.571354329586029,
577
- "rewards/margins": 0.722070038318634,
578
- "rewards/rejected": -1.293424367904663,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
- "grad_norm": 22.757905962081995,
584
  "learning_rate": 6.046442623320145e-08,
585
- "logits/chosen": -0.9251530766487122,
586
- "logits/rejected": -0.8378445506095886,
587
- "logps/chosen": -339.1557312011719,
588
- "logps/rejected": -443.68817138671875,
589
- "loss": 0.5192,
590
- "rewards/accuracies": 0.762499988079071,
591
- "rewards/chosen": -0.7877386212348938,
592
- "rewards/margins": 0.8901177644729614,
593
- "rewards/rejected": -1.6778564453125,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
- "grad_norm": 26.244507090381283,
599
  "learning_rate": 4.904486005914027e-08,
600
- "logits/chosen": -1.2426486015319824,
601
- "logits/rejected": -1.0370407104492188,
602
- "logps/chosen": -425.2216796875,
603
- "logps/rejected": -453.7456970214844,
604
- "loss": 0.5139,
605
- "rewards/accuracies": 0.706250011920929,
606
- "rewards/chosen": -0.7326547503471375,
607
- "rewards/margins": 0.6568619012832642,
608
- "rewards/rejected": -1.3895165920257568,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
- "grad_norm": 24.429980563146632,
614
  "learning_rate": 3.8702478614051345e-08,
615
- "logits/chosen": -1.087192177772522,
616
- "logits/rejected": -0.863726794719696,
617
- "logps/chosen": -332.46685791015625,
618
- "logps/rejected": -387.232421875,
619
- "loss": 0.5321,
620
- "rewards/accuracies": 0.731249988079071,
621
- "rewards/chosen": -0.6953538060188293,
622
- "rewards/margins": 0.6994706988334656,
623
- "rewards/rejected": -1.394824504852295,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
- "grad_norm": 21.16777347186506,
629
  "learning_rate": 2.9492720416985e-08,
630
- "logits/chosen": -1.3283381462097168,
631
- "logits/rejected": -1.0876325368881226,
632
- "logps/chosen": -378.5984802246094,
633
- "logps/rejected": -414.51202392578125,
634
- "loss": 0.531,
635
- "rewards/accuracies": 0.7562500238418579,
636
- "rewards/chosen": -0.6937893629074097,
637
- "rewards/margins": 0.7333989143371582,
638
- "rewards/rejected": -1.4271881580352783,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
- "grad_norm": 26.382095604314472,
644
  "learning_rate": 2.1464952759020856e-08,
645
- "logits/chosen": -1.1406913995742798,
646
- "logits/rejected": -0.9990569353103638,
647
- "logps/chosen": -338.43505859375,
648
- "logps/rejected": -411.83319091796875,
649
- "loss": 0.5126,
650
  "rewards/accuracies": 0.731249988079071,
651
- "rewards/chosen": -0.7397539019584656,
652
- "rewards/margins": 0.7144732475280762,
653
- "rewards/rejected": -1.4542272090911865,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
- "grad_norm": 25.19851833189937,
659
  "learning_rate": 1.4662207078575684e-08,
660
- "logits/chosen": -1.1446809768676758,
661
- "logits/rejected": -0.9151161313056946,
662
- "logps/chosen": -371.24432373046875,
663
- "logps/rejected": -428.16436767578125,
664
- "loss": 0.5024,
665
- "rewards/accuracies": 0.768750011920929,
666
- "rewards/chosen": -0.682637095451355,
667
- "rewards/margins": 0.731964647769928,
668
- "rewards/rejected": -1.4146016836166382,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
- "grad_norm": 23.916130656759286,
674
  "learning_rate": 9.12094829893642e-09,
675
- "logits/chosen": -1.3279728889465332,
676
- "logits/rejected": -1.114848017692566,
677
- "logps/chosen": -332.7235412597656,
678
- "logps/rejected": -360.280029296875,
679
- "loss": 0.5197,
680
- "rewards/accuracies": 0.6937500238418579,
681
- "rewards/chosen": -0.6455050706863403,
682
- "rewards/margins": 0.6436306238174438,
683
- "rewards/rejected": -1.2891355752944946,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
- "grad_norm": 22.16715594060594,
689
  "learning_rate": 4.8708793644441086e-09,
690
- "logits/chosen": -1.0243772268295288,
691
- "logits/rejected": -0.8716105222702026,
692
- "logps/chosen": -358.4526672363281,
693
- "logps/rejected": -418.63916015625,
694
- "loss": 0.5172,
695
- "rewards/accuracies": 0.75,
696
- "rewards/chosen": -0.7147374749183655,
697
- "rewards/margins": 0.7633405923843384,
698
- "rewards/rejected": -1.4780781269073486,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
- "grad_norm": 39.08384701109071,
704
  "learning_rate": 1.9347820230782295e-09,
705
- "logits/chosen": -1.2375072240829468,
706
- "logits/rejected": -0.9246547818183899,
707
- "logps/chosen": -354.1645812988281,
708
- "logps/rejected": -374.7205810546875,
709
- "loss": 0.5302,
710
  "rewards/accuracies": 0.762499988079071,
711
- "rewards/chosen": -0.7204490900039673,
712
- "rewards/margins": 0.6771665811538696,
713
- "rewards/rejected": -1.3976157903671265,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
- "grad_norm": 21.008394137433164,
719
  "learning_rate": 3.2839470889836627e-10,
720
- "logits/chosen": -1.2062015533447266,
721
- "logits/rejected": -1.0119010210037231,
722
- "logps/chosen": -373.7929992675781,
723
- "logps/rejected": -409.3150634765625,
724
- "loss": 0.5174,
725
- "rewards/accuracies": 0.706250011920929,
726
- "rewards/chosen": -0.6295822858810425,
727
- "rewards/margins": 0.6495110392570496,
728
- "rewards/rejected": -1.2790933847427368,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
- "eval_logits/chosen": -1.0303078889846802,
734
- "eval_logits/rejected": -0.8834976553916931,
735
- "eval_logps/chosen": -334.1723937988281,
736
- "eval_logps/rejected": -409.29547119140625,
737
- "eval_loss": 0.5259878039360046,
738
- "eval_rewards/accuracies": 0.77734375,
739
- "eval_rewards/chosen": -0.6380884647369385,
740
- "eval_rewards/margins": 0.7834274172782898,
741
- "eval_rewards/rejected": -1.421515941619873,
742
- "eval_runtime": 167.8072,
743
- "eval_samples_per_second": 11.918,
744
- "eval_steps_per_second": 0.191,
745
  "step": 477
746
  },
747
  {
748
  "epoch": 0.9984301412872841,
749
  "step": 477,
750
  "total_flos": 0.0,
751
- "train_loss": 0.5633771029658288,
752
- "train_runtime": 15455.4899,
753
- "train_samples_per_second": 3.955,
754
- "train_steps_per_second": 0.031
755
  }
756
  ],
757
  "logging_steps": 10,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0020931449502878076,
13
+ "grad_norm": 9.911216937670318,
14
  "learning_rate": 1.0416666666666666e-08,
15
+ "logits/chosen": -2.8090171813964844,
16
+ "logits/rejected": -2.7643635272979736,
17
+ "logps/chosen": -333.44940185546875,
18
+ "logps/rejected": -378.9651184082031,
19
  "loss": 0.6931,
20
  "rewards/accuracies": 0.0,
21
  "rewards/chosen": 0.0,
 
25
  },
26
  {
27
  "epoch": 0.020931449502878074,
28
+ "grad_norm": 9.008729965360628,
29
  "learning_rate": 1.0416666666666667e-07,
30
+ "logits/chosen": -2.597419261932373,
31
+ "logits/rejected": -2.561866521835327,
32
+ "logps/chosen": -323.94671630859375,
33
+ "logps/rejected": -288.5050048828125,
34
+ "loss": 0.6932,
35
+ "rewards/accuracies": 0.3819444477558136,
36
+ "rewards/chosen": 0.0003000612196046859,
37
+ "rewards/margins": -0.00015054795949254185,
38
+ "rewards/rejected": 0.00045060913544148207,
39
  "step": 10
40
  },
41
  {
42
  "epoch": 0.04186289900575615,
43
+ "grad_norm": 7.902884214874011,
44
  "learning_rate": 2.0833333333333333e-07,
45
+ "logits/chosen": -2.6865365505218506,
46
+ "logits/rejected": -2.6199164390563965,
47
+ "logps/chosen": -331.81707763671875,
48
+ "logps/rejected": -296.70428466796875,
49
+ "loss": 0.6929,
50
+ "rewards/accuracies": 0.53125,
51
+ "rewards/chosen": 9.686091652838513e-05,
52
+ "rewards/margins": -0.00012469211651477963,
53
+ "rewards/rejected": 0.0002215529966633767,
54
  "step": 20
55
  },
56
  {
57
  "epoch": 0.06279434850863422,
58
+ "grad_norm": 8.230332507186178,
59
  "learning_rate": 3.1249999999999997e-07,
60
+ "logits/chosen": -2.6669394969940186,
61
+ "logits/rejected": -2.5895800590515137,
62
+ "logps/chosen": -310.1693420410156,
63
+ "logps/rejected": -260.9254455566406,
64
+ "loss": 0.6912,
65
  "rewards/accuracies": 0.625,
66
+ "rewards/chosen": 0.003129849676042795,
67
+ "rewards/margins": 0.004864652641117573,
68
+ "rewards/rejected": -0.0017348021501675248,
69
  "step": 30
70
  },
71
  {
72
  "epoch": 0.0837257980115123,
73
+ "grad_norm": 8.00937157715773,
74
  "learning_rate": 4.1666666666666667e-07,
75
+ "logits/chosen": -2.5729217529296875,
76
+ "logits/rejected": -2.530136823654175,
77
+ "logps/chosen": -285.17449951171875,
78
+ "logps/rejected": -272.303955078125,
79
+ "loss": 0.6863,
80
+ "rewards/accuracies": 0.6312500238418579,
81
+ "rewards/chosen": 0.0071965730749070644,
82
+ "rewards/margins": 0.013435715809464455,
83
+ "rewards/rejected": -0.006239141337573528,
84
  "step": 40
85
  },
86
  {
87
  "epoch": 0.10465724751439037,
88
+ "grad_norm": 8.079059688602763,
89
  "learning_rate": 4.999731868769026e-07,
90
+ "logits/chosen": -2.601407051086426,
91
+ "logits/rejected": -2.5244762897491455,
92
+ "logps/chosen": -292.8643798828125,
93
+ "logps/rejected": -286.71905517578125,
94
+ "loss": 0.6785,
95
+ "rewards/accuracies": 0.65625,
96
+ "rewards/chosen": 0.011629783548414707,
97
+ "rewards/margins": 0.02448815107345581,
98
+ "rewards/rejected": -0.012858365662395954,
99
  "step": 50
100
  },
101
  {
102
  "epoch": 0.12558869701726844,
103
+ "grad_norm": 9.017052709484783,
104
  "learning_rate": 4.990353313429303e-07,
105
+ "logits/chosen": -2.6307859420776367,
106
+ "logits/rejected": -2.5665087699890137,
107
+ "logps/chosen": -265.0750732421875,
108
+ "logps/rejected": -258.7865905761719,
109
+ "loss": 0.6675,
110
+ "rewards/accuracies": 0.768750011920929,
111
+ "rewards/chosen": 0.021959755569696426,
112
+ "rewards/margins": 0.06315209716558456,
113
+ "rewards/rejected": -0.04119233787059784,
114
  "step": 60
115
  },
116
  {
117
  "epoch": 0.14652014652014653,
118
+ "grad_norm": 8.264582912076714,
119
  "learning_rate": 4.967625656594781e-07,
120
+ "logits/chosen": -2.5459885597229004,
121
+ "logits/rejected": -2.5088202953338623,
122
+ "logps/chosen": -312.3421630859375,
123
+ "logps/rejected": -306.2401428222656,
124
+ "loss": 0.6493,
125
+ "rewards/accuracies": 0.699999988079071,
126
+ "rewards/chosen": -0.004852661397308111,
127
+ "rewards/margins": 0.10119061172008514,
128
+ "rewards/rejected": -0.1060432642698288,
129
  "step": 70
130
  },
131
  {
132
  "epoch": 0.1674515960230246,
133
+ "grad_norm": 10.960281749614031,
134
  "learning_rate": 4.93167072587771e-07,
135
+ "logits/chosen": -2.6571133136749268,
136
+ "logits/rejected": -2.5383903980255127,
137
+ "logps/chosen": -354.3619689941406,
138
+ "logps/rejected": -291.69244384765625,
139
+ "loss": 0.6364,
140
+ "rewards/accuracies": 0.71875,
141
+ "rewards/chosen": -0.05542845278978348,
142
+ "rewards/margins": 0.1628389060497284,
143
+ "rewards/rejected": -0.21826735138893127,
144
  "step": 80
145
  },
146
  {
147
  "epoch": 0.18838304552590268,
148
+ "grad_norm": 13.098090567021211,
149
  "learning_rate": 4.882681251368548e-07,
150
+ "logits/chosen": -2.6158549785614014,
151
+ "logits/rejected": -2.5619349479675293,
152
+ "logps/chosen": -288.7886047363281,
153
+ "logps/rejected": -311.8244934082031,
154
+ "loss": 0.6093,
155
+ "rewards/accuracies": 0.699999988079071,
156
+ "rewards/chosen": -0.21366631984710693,
157
+ "rewards/margins": 0.22560691833496094,
158
+ "rewards/rejected": -0.4392732083797455,
159
  "step": 90
160
  },
161
  {
162
  "epoch": 0.20931449502878074,
163
+ "grad_norm": 10.707751058389576,
164
  "learning_rate": 4.820919832540181e-07,
165
+ "logits/chosen": -2.6393892765045166,
166
+ "logits/rejected": -2.5948047637939453,
167
+ "logps/chosen": -343.1591491699219,
168
+ "logps/rejected": -353.9322814941406,
169
+ "loss": 0.6065,
170
+ "rewards/accuracies": 0.699999988079071,
171
+ "rewards/chosen": -0.2791406810283661,
172
+ "rewards/margins": 0.35397782921791077,
173
+ "rewards/rejected": -0.6331185102462769,
174
  "step": 100
175
  },
176
  {
177
  "epoch": 0.2302459445316588,
178
+ "grad_norm": 12.745950954246485,
179
  "learning_rate": 4.7467175306295647e-07,
180
+ "logits/chosen": -2.711115837097168,
181
+ "logits/rejected": -2.6523733139038086,
182
+ "logps/chosen": -335.3150634765625,
183
+ "logps/rejected": -345.87237548828125,
184
+ "loss": 0.6078,
185
+ "rewards/accuracies": 0.6937500238418579,
186
+ "rewards/chosen": -0.2736280560493469,
187
+ "rewards/margins": 0.2958160936832428,
188
+ "rewards/rejected": -0.5694441795349121,
189
  "step": 110
190
  },
191
  {
192
  "epoch": 0.25117739403453687,
193
+ "grad_norm": 12.922005253605358,
194
  "learning_rate": 4.6604720940421207e-07,
195
+ "logits/chosen": -2.643698215484619,
196
+ "logits/rejected": -2.6443088054656982,
197
+ "logps/chosen": -316.6920166015625,
198
+ "logps/rejected": -346.8565979003906,
199
+ "loss": 0.5867,
200
+ "rewards/accuracies": 0.71875,
201
+ "rewards/chosen": -0.32213813066482544,
202
+ "rewards/margins": 0.4012001156806946,
203
+ "rewards/rejected": -0.7233381867408752,
204
  "step": 120
205
  },
206
  {
207
  "epoch": 0.272108843537415,
208
+ "grad_norm": 14.301862772426597,
209
  "learning_rate": 4.5626458262912735e-07,
210
+ "logits/chosen": -2.71155047416687,
211
+ "logits/rejected": -2.6922905445098877,
212
+ "logps/chosen": -340.5058288574219,
213
+ "logps/rejected": -356.29541015625,
214
+ "loss": 0.568,
215
+ "rewards/accuracies": 0.6875,
216
+ "rewards/chosen": -0.40051087737083435,
217
+ "rewards/margins": 0.34178003668785095,
218
+ "rewards/rejected": -0.7422909140586853,
219
  "step": 130
220
  },
221
  {
222
  "epoch": 0.29304029304029305,
223
+ "grad_norm": 19.29337794069041,
224
  "learning_rate": 4.453763107901675e-07,
225
+ "logits/chosen": -2.7858147621154785,
226
+ "logits/rejected": -2.755225419998169,
227
+ "logps/chosen": -368.2306213378906,
228
+ "logps/rejected": -366.3923645019531,
229
+ "loss": 0.5792,
230
  "rewards/accuracies": 0.7562500238418579,
231
+ "rewards/chosen": -0.3208185136318207,
232
+ "rewards/margins": 0.485020250082016,
233
+ "rewards/rejected": -0.8058387637138367,
234
  "step": 140
235
  },
236
  {
237
  "epoch": 0.3139717425431711,
238
+ "grad_norm": 20.243585876180912,
239
  "learning_rate": 4.3344075855595097e-07,
240
+ "logits/chosen": -2.7599551677703857,
241
+ "logits/rejected": -2.7298169136047363,
242
+ "logps/chosen": -338.7522888183594,
243
+ "logps/rejected": -336.2152404785156,
244
+ "loss": 0.5849,
245
+ "rewards/accuracies": 0.706250011920929,
246
+ "rewards/chosen": -0.4120866358280182,
247
+ "rewards/margins": 0.35028699040412903,
248
+ "rewards/rejected": -0.7623735666275024,
249
  "step": 150
250
  },
251
  {
252
  "epoch": 0.3349031920460492,
253
+ "grad_norm": 18.324736579566704,
254
  "learning_rate": 4.2052190435769554e-07,
255
+ "logits/chosen": -2.8034064769744873,
256
+ "logits/rejected": -2.7623839378356934,
257
+ "logps/chosen": -342.80474853515625,
258
+ "logps/rejected": -357.0664978027344,
259
+ "loss": 0.5882,
260
+ "rewards/accuracies": 0.7250000238418579,
261
+ "rewards/chosen": -0.5081926584243774,
262
+ "rewards/margins": 0.4391111433506012,
263
+ "rewards/rejected": -0.9473037719726562,
264
  "step": 160
265
  },
266
  {
267
  "epoch": 0.35583464154892724,
268
+ "grad_norm": 16.282660960565977,
269
  "learning_rate": 4.0668899744407567e-07,
270
+ "logits/chosen": -2.645301103591919,
271
+ "logits/rejected": -2.628730058670044,
272
+ "logps/chosen": -332.9082946777344,
273
+ "logps/rejected": -339.78851318359375,
274
+ "loss": 0.5924,
275
+ "rewards/accuracies": 0.668749988079071,
276
+ "rewards/chosen": -0.5716486573219299,
277
+ "rewards/margins": 0.3343070447444916,
278
+ "rewards/rejected": -0.9059556722640991,
279
  "step": 170
280
  },
281
  {
282
  "epoch": 0.37676609105180536,
283
+ "grad_norm": 15.21087493482888,
284
  "learning_rate": 3.920161866827889e-07,
285
+ "logits/chosen": -2.7087159156799316,
286
+ "logits/rejected": -2.672781467437744,
287
+ "logps/chosen": -327.3506774902344,
288
+ "logps/rejected": -333.36383056640625,
289
+ "loss": 0.5527,
290
+ "rewards/accuracies": 0.706250011920929,
291
+ "rewards/chosen": -0.4208357334136963,
292
+ "rewards/margins": 0.3587748408317566,
293
+ "rewards/rejected": -0.7796105742454529,
294
  "step": 180
295
  },
296
  {
297
  "epoch": 0.3976975405546834,
298
+ "grad_norm": 17.143640359397452,
299
  "learning_rate": 3.765821230985757e-07,
300
+ "logits/chosen": -2.6293439865112305,
301
+ "logits/rejected": -2.5997588634490967,
302
+ "logps/chosen": -319.8256530761719,
303
+ "logps/rejected": -352.70550537109375,
304
+ "loss": 0.563,
305
+ "rewards/accuracies": 0.6937500238418579,
306
+ "rewards/chosen": -0.47969502210617065,
307
+ "rewards/margins": 0.37083858251571655,
308
+ "rewards/rejected": -0.850533664226532,
309
  "step": 190
310
  },
311
  {
312
  "epoch": 0.4186289900575615,
313
+ "grad_norm": 58.63667616094113,
314
  "learning_rate": 3.604695382782159e-07,
315
+ "logits/chosen": -2.535177230834961,
316
+ "logits/rejected": -2.538287878036499,
317
+ "logps/chosen": -317.7572937011719,
318
+ "logps/rejected": -378.38128662109375,
319
+ "loss": 0.5741,
320
+ "rewards/accuracies": 0.706250011920929,
321
+ "rewards/chosen": -0.5162296295166016,
322
+ "rewards/margins": 0.4109058976173401,
323
+ "rewards/rejected": -0.9271354675292969,
324
  "step": 200
325
  },
326
  {
327
  "epoch": 0.43956043956043955,
328
+ "grad_norm": 22.07041734817236,
329
  "learning_rate": 3.4376480090239047e-07,
330
+ "logits/chosen": -2.5014660358428955,
331
+ "logits/rejected": -2.402636766433716,
332
+ "logps/chosen": -374.38275146484375,
333
+ "logps/rejected": -362.0113830566406,
334
+ "loss": 0.5684,
335
+ "rewards/accuracies": 0.71875,
336
+ "rewards/chosen": -0.46330127120018005,
337
+ "rewards/margins": 0.5060497522354126,
338
+ "rewards/rejected": -0.9693509936332703,
339
  "step": 210
340
  },
341
  {
342
  "epoch": 0.4604918890633176,
343
+ "grad_norm": 19.601082604792087,
344
  "learning_rate": 3.265574537815398e-07,
345
+ "logits/chosen": -2.2485015392303467,
346
+ "logits/rejected": -2.2393312454223633,
347
+ "logps/chosen": -319.60406494140625,
348
+ "logps/rejected": -370.166259765625,
349
+ "loss": 0.562,
350
+ "rewards/accuracies": 0.7437499761581421,
351
+ "rewards/chosen": -0.6189771294593811,
352
+ "rewards/margins": 0.5012843608856201,
353
+ "rewards/rejected": -1.1202614307403564,
354
  "step": 220
355
  },
356
  {
357
  "epoch": 0.48142333856619574,
358
+ "grad_norm": 18.201365099494677,
359
  "learning_rate": 3.0893973387735683e-07,
360
+ "logits/chosen": -2.3294405937194824,
361
+ "logits/rejected": -2.226823568344116,
362
+ "logps/chosen": -347.8485412597656,
363
+ "logps/rejected": -399.61407470703125,
364
+ "loss": 0.5531,
365
+ "rewards/accuracies": 0.768750011920929,
366
+ "rewards/chosen": -0.6986296772956848,
367
+ "rewards/margins": 0.7771102786064148,
368
+ "rewards/rejected": -1.4757399559020996,
369
  "step": 230
370
  },
371
  {
372
  "epoch": 0.5023547880690737,
373
+ "grad_norm": 17.978204290263523,
374
  "learning_rate": 2.910060778827554e-07,
375
+ "logits/chosen": -2.3042497634887695,
376
+ "logits/rejected": -2.198915481567383,
377
+ "logps/chosen": -367.82342529296875,
378
+ "logps/rejected": -391.17327880859375,
379
+ "loss": 0.5274,
380
+ "rewards/accuracies": 0.75,
381
+ "rewards/chosen": -0.5975519418716431,
382
+ "rewards/margins": 0.5712900161743164,
383
+ "rewards/rejected": -1.1688419580459595,
384
  "step": 240
385
  },
386
  {
387
  "epoch": 0.5232862375719518,
388
+ "grad_norm": 24.29328586959325,
389
  "learning_rate": 2.7285261601056697e-07,
390
+ "logits/chosen": -2.198782444000244,
391
+ "logits/rejected": -2.053729295730591,
392
+ "logps/chosen": -369.4089050292969,
393
+ "logps/rejected": -402.0597229003906,
394
+ "loss": 0.5322,
395
+ "rewards/accuracies": 0.793749988079071,
396
+ "rewards/chosen": -0.7310666441917419,
397
+ "rewards/margins": 0.732733428478241,
398
+ "rewards/rejected": -1.4638001918792725,
399
  "step": 250
400
  },
401
  {
402
  "epoch": 0.54421768707483,
403
+ "grad_norm": 22.871676623689975,
404
  "learning_rate": 2.5457665670441937e-07,
405
+ "logits/chosen": -2.0881309509277344,
406
+ "logits/rejected": -2.056945323944092,
407
+ "logps/chosen": -372.88616943359375,
408
+ "logps/rejected": -410.09320068359375,
409
+ "loss": 0.5374,
410
+ "rewards/accuracies": 0.6875,
411
+ "rewards/chosen": -0.9354459047317505,
412
+ "rewards/margins": 0.6017513275146484,
413
+ "rewards/rejected": -1.5371973514556885,
414
  "step": 260
415
  },
416
  {
417
  "epoch": 0.565149136577708,
418
+ "grad_norm": 18.143041901320185,
419
  "learning_rate": 2.3627616503391812e-07,
420
+ "logits/chosen": -2.0207080841064453,
421
+ "logits/rejected": -1.9440813064575195,
422
+ "logps/chosen": -368.6573791503906,
423
+ "logps/rejected": -404.0098876953125,
424
+ "loss": 0.541,
425
+ "rewards/accuracies": 0.731249988079071,
426
+ "rewards/chosen": -0.5764263868331909,
427
+ "rewards/margins": 0.6674422025680542,
428
+ "rewards/rejected": -1.2438685894012451,
429
  "step": 270
430
  },
431
  {
432
  "epoch": 0.5860805860805861,
433
+ "grad_norm": 24.238102577124657,
434
  "learning_rate": 2.1804923757009882e-07,
435
+ "logits/chosen": -1.797975778579712,
436
+ "logits/rejected": -1.6956411600112915,
437
+ "logps/chosen": -339.45184326171875,
438
+ "logps/rejected": -357.5285339355469,
439
+ "loss": 0.5477,
440
+ "rewards/accuracies": 0.71875,
441
+ "rewards/chosen": -0.5778101086616516,
442
+ "rewards/margins": 0.5612425804138184,
443
+ "rewards/rejected": -1.1390526294708252,
444
  "step": 280
445
  },
446
  {
447
  "epoch": 0.6070120355834642,
448
+ "grad_norm": 22.41015619268828,
449
  "learning_rate": 1.9999357655598891e-07,
450
+ "logits/chosen": -1.8829982280731201,
451
+ "logits/rejected": -1.7629162073135376,
452
+ "logps/chosen": -342.6054382324219,
453
+ "logps/rejected": -391.98907470703125,
454
+ "loss": 0.5392,
455
+ "rewards/accuracies": 0.7437499761581421,
456
+ "rewards/chosen": -0.8052763938903809,
457
+ "rewards/margins": 0.5734516382217407,
458
+ "rewards/rejected": -1.3787280321121216,
459
  "step": 290
460
  },
461
  {
462
  "epoch": 0.6279434850863422,
463
+ "grad_norm": 19.200480239691995,
464
  "learning_rate": 1.8220596619089573e-07,
465
+ "logits/chosen": -1.8666107654571533,
466
+ "logits/rejected": -1.7327511310577393,
467
+ "logps/chosen": -412.94073486328125,
468
+ "logps/rejected": -428.00225830078125,
469
+ "loss": 0.5268,
470
+ "rewards/accuracies": 0.6875,
471
+ "rewards/chosen": -0.7397323846817017,
472
+ "rewards/margins": 0.6389673948287964,
473
+ "rewards/rejected": -1.3786996603012085,
474
  "step": 300
475
  },
476
  {
477
  "epoch": 0.6488749345892203,
478
+ "grad_norm": 23.166055643365755,
479
  "learning_rate": 1.647817538357072e-07,
480
+ "logits/chosen": -1.8956129550933838,
481
+ "logits/rejected": -1.7741343975067139,
482
+ "logps/chosen": -382.96661376953125,
483
+ "logps/rejected": -403.57830810546875,
484
+ "loss": 0.5199,
485
+ "rewards/accuracies": 0.7562500238418579,
486
+ "rewards/chosen": -0.6354427933692932,
487
+ "rewards/margins": 0.7677633762359619,
488
+ "rewards/rejected": -1.4032061100006104,
489
  "step": 310
490
  },
491
  {
492
  "epoch": 0.6698063840920984,
493
+ "grad_norm": 21.777286765570032,
494
  "learning_rate": 1.478143389201113e-07,
495
+ "logits/chosen": -1.805687665939331,
496
+ "logits/rejected": -1.6587250232696533,
497
+ "logps/chosen": -335.69287109375,
498
+ "logps/rejected": -379.68743896484375,
499
+ "loss": 0.5135,
500
+ "rewards/accuracies": 0.831250011920929,
501
+ "rewards/chosen": -0.6119715571403503,
502
+ "rewards/margins": 0.7994168996810913,
503
+ "rewards/rejected": -1.4113883972167969,
504
  "step": 320
505
  },
506
  {
507
  "epoch": 0.6907378335949764,
508
+ "grad_norm": 23.11844824124375,
509
  "learning_rate": 1.3139467229135998e-07,
510
+ "logits/chosen": -1.8413807153701782,
511
+ "logits/rejected": -1.8144184350967407,
512
+ "logps/chosen": -349.6283264160156,
513
+ "logps/rejected": -406.96295166015625,
514
+ "loss": 0.5224,
515
+ "rewards/accuracies": 0.706250011920929,
516
+ "rewards/chosen": -0.6430560350418091,
517
+ "rewards/margins": 0.6470705270767212,
518
+ "rewards/rejected": -1.2901265621185303,
519
  "step": 330
520
  },
521
  {
522
  "epoch": 0.7116692830978545,
523
+ "grad_norm": 24.015437020386692,
524
  "learning_rate": 1.1561076868822755e-07,
525
+ "logits/chosen": -1.717903733253479,
526
+ "logits/rejected": -1.6341816186904907,
527
+ "logps/chosen": -378.47442626953125,
528
+ "logps/rejected": -422.95654296875,
529
+ "loss": 0.5249,
530
+ "rewards/accuracies": 0.71875,
531
+ "rewards/chosen": -0.7014169692993164,
532
+ "rewards/margins": 0.6979398131370544,
533
+ "rewards/rejected": -1.3993569612503052,
534
  "step": 340
535
  },
536
  {
537
  "epoch": 0.7326007326007326,
538
+ "grad_norm": 28.45852201826551,
539
  "learning_rate": 1.0054723495346482e-07,
540
+ "logits/chosen": -1.7926820516586304,
541
+ "logits/rejected": -1.6367231607437134,
542
+ "logps/chosen": -344.37890625,
543
+ "logps/rejected": -383.24432373046875,
544
+ "loss": 0.496,
545
+ "rewards/accuracies": 0.7437499761581421,
546
+ "rewards/chosen": -0.6705678701400757,
547
+ "rewards/margins": 0.6647164225578308,
548
+ "rewards/rejected": -1.3352842330932617,
549
  "step": 350
550
  },
551
  {
552
  "epoch": 0.7535321821036107,
553
+ "grad_norm": 20.39869216136763,
554
  "learning_rate": 8.628481651367875e-08,
555
+ "logits/chosen": -1.7070884704589844,
556
+ "logits/rejected": -1.5379220247268677,
557
+ "logps/chosen": -400.3146667480469,
558
+ "logps/rejected": -420.22015380859375,
559
+ "loss": 0.549,
560
+ "rewards/accuracies": 0.731249988079071,
561
+ "rewards/chosen": -0.7352498769760132,
562
+ "rewards/margins": 0.6947922706604004,
563
+ "rewards/rejected": -1.4300422668457031,
564
  "step": 360
565
  },
566
  {
567
  "epoch": 0.7744636316064888,
568
+ "grad_norm": 27.00468749784755,
569
  "learning_rate": 7.289996455765748e-08,
570
+ "logits/chosen": -1.6615266799926758,
571
+ "logits/rejected": -1.5225656032562256,
572
+ "logps/chosen": -355.3984069824219,
573
+ "logps/rejected": -392.2328186035156,
574
+ "loss": 0.5269,
575
+ "rewards/accuracies": 0.762499988079071,
576
+ "rewards/chosen": -0.7128881216049194,
577
+ "rewards/margins": 0.735795795917511,
578
+ "rewards/rejected": -1.4486840963363647,
579
  "step": 370
580
  },
581
  {
582
  "epoch": 0.7953950811093669,
583
+ "grad_norm": 20.51420624974921,
584
  "learning_rate": 6.046442623320145e-08,
585
+ "logits/chosen": -1.4373382329940796,
586
+ "logits/rejected": -1.3852484226226807,
587
+ "logps/chosen": -353.9813537597656,
588
+ "logps/rejected": -454.80145263671875,
589
+ "loss": 0.5166,
590
+ "rewards/accuracies": 0.731249988079071,
591
+ "rewards/chosen": -0.8685728311538696,
592
+ "rewards/margins": 0.8601717948913574,
593
+ "rewards/rejected": -1.7287447452545166,
594
  "step": 380
595
  },
596
  {
597
  "epoch": 0.8163265306122449,
598
+ "grad_norm": 30.849969753036305,
599
  "learning_rate": 4.904486005914027e-08,
600
+ "logits/chosen": -1.653955101966858,
601
+ "logits/rejected": -1.5138956308364868,
602
+ "logps/chosen": -446.8741149902344,
603
+ "logps/rejected": -475.45977783203125,
604
+ "loss": 0.5155,
605
+ "rewards/accuracies": 0.699999988079071,
606
+ "rewards/chosen": -0.8643356561660767,
607
+ "rewards/margins": 0.660868227481842,
608
+ "rewards/rejected": -1.525203824043274,
609
  "step": 390
610
  },
611
  {
612
  "epoch": 0.837257980115123,
613
+ "grad_norm": 23.018534146860215,
614
  "learning_rate": 3.8702478614051345e-08,
615
+ "logits/chosen": -1.5653860569000244,
616
+ "logits/rejected": -1.4222080707550049,
617
+ "logps/chosen": -345.37451171875,
618
+ "logps/rejected": -400.10565185546875,
619
+ "loss": 0.5294,
620
+ "rewards/accuracies": 0.7437499761581421,
621
+ "rewards/chosen": -0.7452888488769531,
622
+ "rewards/margins": 0.7274179458618164,
623
+ "rewards/rejected": -1.47270667552948,
624
  "step": 400
625
  },
626
  {
627
  "epoch": 0.858189429618001,
628
+ "grad_norm": 21.423785778797033,
629
  "learning_rate": 2.9492720416985e-08,
630
+ "logits/chosen": -1.7361204624176025,
631
+ "logits/rejected": -1.5790544748306274,
632
+ "logps/chosen": -392.1049499511719,
633
+ "logps/rejected": -428.69873046875,
634
+ "loss": 0.5345,
635
+ "rewards/accuracies": 0.762499988079071,
636
+ "rewards/chosen": -0.726129412651062,
637
+ "rewards/margins": 0.7564193606376648,
638
+ "rewards/rejected": -1.4825488328933716,
639
  "step": 410
640
  },
641
  {
642
  "epoch": 0.8791208791208791,
643
+ "grad_norm": 25.906686849547086,
644
  "learning_rate": 2.1464952759020856e-08,
645
+ "logits/chosen": -1.5480293035507202,
646
+ "logits/rejected": -1.462304711341858,
647
+ "logps/chosen": -354.43505859375,
648
+ "logps/rejected": -425.56005859375,
649
+ "loss": 0.5171,
650
  "rewards/accuracies": 0.731249988079071,
651
+ "rewards/chosen": -0.810673713684082,
652
+ "rewards/margins": 0.7154702544212341,
653
+ "rewards/rejected": -1.5261439085006714,
654
  "step": 420
655
  },
656
  {
657
  "epoch": 0.9000523286237572,
658
+ "grad_norm": 24.715416421591268,
659
  "learning_rate": 1.4662207078575684e-08,
660
+ "logits/chosen": -1.5542974472045898,
661
+ "logits/rejected": -1.385867714881897,
662
+ "logps/chosen": -381.9129943847656,
663
+ "logps/rejected": -429.78900146484375,
664
+ "loss": 0.5057,
665
+ "rewards/accuracies": 0.75,
666
+ "rewards/chosen": -0.7492018342018127,
667
+ "rewards/margins": 0.7330363988876343,
668
+ "rewards/rejected": -1.4822382926940918,
669
  "step": 430
670
  },
671
  {
672
  "epoch": 0.9209837781266352,
673
+ "grad_norm": 26.044110008316927,
674
  "learning_rate": 9.12094829893642e-09,
675
+ "logits/chosen": -1.6991631984710693,
676
+ "logits/rejected": -1.5472667217254639,
677
+ "logps/chosen": -346.50555419921875,
678
+ "logps/rejected": -379.22064208984375,
679
+ "loss": 0.5129,
680
+ "rewards/accuracies": 0.7124999761581421,
681
+ "rewards/chosen": -0.692093551158905,
682
+ "rewards/margins": 0.6859078407287598,
683
+ "rewards/rejected": -1.37800133228302,
684
  "step": 440
685
  },
686
  {
687
  "epoch": 0.9419152276295133,
688
+ "grad_norm": 20.37342934046794,
689
  "learning_rate": 4.8708793644441086e-09,
690
+ "logits/chosen": -1.4247163534164429,
691
+ "logits/rejected": -1.3255692720413208,
692
+ "logps/chosen": -370.42413330078125,
693
+ "logps/rejected": -431.6888732910156,
694
+ "loss": 0.5152,
695
+ "rewards/accuracies": 0.7250000238418579,
696
+ "rewards/chosen": -0.7839125394821167,
697
+ "rewards/margins": 0.773055374622345,
698
+ "rewards/rejected": -1.5569679737091064,
699
  "step": 450
700
  },
701
  {
702
  "epoch": 0.9628466771323915,
703
+ "grad_norm": 25.986771829001402,
704
  "learning_rate": 1.9347820230782295e-09,
705
+ "logits/chosen": -1.601322889328003,
706
+ "logits/rejected": -1.3769402503967285,
707
+ "logps/chosen": -367.97900390625,
708
+ "logps/rejected": -388.46136474609375,
709
+ "loss": 0.522,
710
  "rewards/accuracies": 0.762499988079071,
711
+ "rewards/chosen": -0.7656736969947815,
712
+ "rewards/margins": 0.7120579481124878,
713
+ "rewards/rejected": -1.4777315855026245,
714
  "step": 460
715
  },
716
  {
717
  "epoch": 0.9837781266352695,
718
+ "grad_norm": 23.11351679142545,
719
  "learning_rate": 3.2839470889836627e-10,
720
+ "logits/chosen": -1.5826936960220337,
721
+ "logits/rejected": -1.4511644840240479,
722
+ "logps/chosen": -392.7727966308594,
723
+ "logps/rejected": -427.092041015625,
724
+ "loss": 0.5147,
725
+ "rewards/accuracies": 0.7437499761581421,
726
+ "rewards/chosen": -0.6837440133094788,
727
+ "rewards/margins": 0.6925565600395203,
728
+ "rewards/rejected": -1.376300573348999,
729
  "step": 470
730
  },
731
  {
732
  "epoch": 0.9984301412872841,
733
+ "eval_logits/chosen": -1.4815254211425781,
734
+ "eval_logits/rejected": -1.3790271282196045,
735
+ "eval_logps/chosen": -348.12322998046875,
736
+ "eval_logps/rejected": -425.112060546875,
737
+ "eval_loss": 0.5279496908187866,
738
+ "eval_rewards/accuracies": 0.78125,
739
+ "eval_rewards/chosen": -0.6818673610687256,
740
+ "eval_rewards/margins": 0.8081312775611877,
741
+ "eval_rewards/rejected": -1.4899988174438477,
742
+ "eval_runtime": 169.0575,
743
+ "eval_samples_per_second": 11.83,
744
+ "eval_steps_per_second": 0.189,
745
  "step": 477
746
  },
747
  {
748
  "epoch": 0.9984301412872841,
749
  "step": 477,
750
  "total_flos": 0.0,
751
+ "train_loss": 0.5669088098737929,
752
+ "train_runtime": 15771.2037,
753
+ "train_samples_per_second": 3.876,
754
+ "train_steps_per_second": 0.03
755
  }
756
  ],
757
  "logging_steps": 10,