li-muyang committed on
Commit
6b0af92
·
verified ·
1 Parent(s): b08b6b1

Model save

Browse files
Files changed (4) hide show
  1. README.md +14 -14
  2. all_results.json +4 -4
  3. train_results.json +4 -4
  4. trainer_state.json +695 -695
README.md CHANGED
@@ -1,7 +1,7 @@
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
- base_model: mistralai/Mistral-7B-v0.3
5
  tags:
6
  - trl
7
  - sft
@@ -18,9 +18,9 @@ should probably proofread and complete it, then remove this comment. -->
18
 
19
  # zephyr-7b-sft-full
20
 
21
- This model is a fine-tuned version of [mistralai/Mistral-7B-v0.3](https://huggingface.co/mistralai/Mistral-7B-v0.3) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
- - Loss: 0.9448
24
 
25
  ## Model description
26
 
@@ -39,7 +39,7 @@ More information needed
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
- - learning_rate: 2e-05
43
  - train_batch_size: 8
44
  - eval_batch_size: 16
45
  - seed: 42
@@ -57,16 +57,16 @@ The following hyperparameters were used during training:
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
- | 1.025 | 0.0923 | 100 | 1.0240 |
61
- | 1.033 | 0.1846 | 200 | 1.0464 |
62
- | 1.037 | 0.2769 | 300 | 1.0424 |
63
- | 1.0136 | 0.3692 | 400 | 1.0295 |
64
- | 1.0229 | 0.4615 | 500 | 1.0151 |
65
- | 0.9745 | 0.5538 | 600 | 0.9945 |
66
- | 0.9441 | 0.6461 | 700 | 0.9769 |
67
- | 0.9277 | 0.7383 | 800 | 0.9613 |
68
- | 0.9384 | 0.8306 | 900 | 0.9501 |
69
- | 0.9216 | 0.9229 | 1000 | 0.9448 |
70
 
71
 
72
  ### Framework versions
 
1
  ---
2
  library_name: transformers
3
  license: apache-2.0
4
+ base_model: mistralai/Mistral-7B-v0.1
5
  tags:
6
  - trl
7
  - sft
 
18
 
19
  # zephyr-7b-sft-full
20
 
21
+ This model is a fine-tuned version of [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) on the generator dataset.
22
  It achieves the following results on the evaluation set:
23
+ - Loss: 0.9293
24
 
25
  ## Model description
26
 
 
39
  ### Training hyperparameters
40
 
41
  The following hyperparameters were used during training:
42
+ - learning_rate: 1e-05
43
  - train_batch_size: 8
44
  - eval_batch_size: 16
45
  - seed: 42
 
57
 
58
  | Training Loss | Epoch | Step | Validation Loss |
59
  |:-------------:|:------:|:----:|:---------------:|
60
+ | 0.9896 | 0.0923 | 100 | 0.9893 |
61
+ | 0.9838 | 0.1846 | 200 | 0.9935 |
62
+ | 0.9853 | 0.2769 | 300 | 0.9881 |
63
+ | 0.9638 | 0.3692 | 400 | 0.9781 |
64
+ | 0.9745 | 0.4615 | 500 | 0.9680 |
65
+ | 0.9396 | 0.5538 | 600 | 0.9568 |
66
+ | 0.9176 | 0.6461 | 700 | 0.9465 |
67
+ | 0.9067 | 0.7383 | 800 | 0.9379 |
68
+ | 0.9221 | 0.8306 | 900 | 0.9320 |
69
+ | 0.9087 | 0.9229 | 1000 | 0.9293 |
70
 
71
 
72
  ### Framework versions
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
- "train_loss": 0.986508995762382,
5
- "train_runtime": 33955.1767,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 4.084,
8
- "train_steps_per_second": 0.032
9
  }
 
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
+ "train_loss": 0.9547446678880179,
5
+ "train_runtime": 38927.5133,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 3.563,
8
+ "train_steps_per_second": 0.028
9
  }
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
- "train_loss": 0.986508995762382,
5
- "train_runtime": 33955.1767,
6
  "train_samples": 207864,
7
- "train_samples_per_second": 4.084,
8
- "train_steps_per_second": 0.032
9
  }
 
1
  {
2
  "epoch": 0.9995385325334564,
3
  "total_flos": 453306954547200.0,
4
+ "train_loss": 0.9547446678880179,
5
+ "train_runtime": 38927.5133,
6
  "train_samples": 207864,
7
+ "train_samples_per_second": 3.563,
8
+ "train_steps_per_second": 0.028
9
  }
trainer_state.json CHANGED
@@ -10,1611 +10,1611 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.0009229349330872173,
13
- "grad_norm": 10.726049490177841,
14
- "learning_rate": 1.8348623853211012e-07,
15
- "loss": 1.1497,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0046146746654360865,
20
- "grad_norm": 8.900657171326609,
21
- "learning_rate": 9.174311926605506e-07,
22
- "loss": 1.1397,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.009229349330872173,
27
- "grad_norm": 3.735942046557525,
28
- "learning_rate": 1.8348623853211011e-06,
29
- "loss": 1.0694,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01384402399630826,
34
- "grad_norm": 2.8318178389546484,
35
- "learning_rate": 2.7522935779816517e-06,
36
- "loss": 1.0196,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.018458698661744346,
41
- "grad_norm": 2.114174455764573,
42
- "learning_rate": 3.6697247706422022e-06,
43
- "loss": 1.0136,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023073373327180433,
48
- "grad_norm": 2.310487403214644,
49
- "learning_rate": 4.587155963302753e-06,
50
- "loss": 1.0042,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.02768804799261652,
55
- "grad_norm": 2.4801571550073933,
56
- "learning_rate": 5.504587155963303e-06,
57
- "loss": 0.9742,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.032302722658052604,
62
- "grad_norm": 2.902534165090561,
63
- "learning_rate": 6.422018348623854e-06,
64
- "loss": 1.0012,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03691739732348869,
69
- "grad_norm": 2.2959784184113547,
70
- "learning_rate": 7.3394495412844045e-06,
71
- "loss": 1.0086,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04153207198892478,
76
- "grad_norm": 2.3869379848167416,
77
- "learning_rate": 8.256880733944956e-06,
78
- "loss": 0.987,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046146746654360866,
83
- "grad_norm": 2.0557468408917527,
84
- "learning_rate": 9.174311926605506e-06,
85
- "loss": 0.9824,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.050761421319796954,
90
- "grad_norm": 2.2160008605472874,
91
- "learning_rate": 1.0091743119266055e-05,
92
- "loss": 0.986,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05537609598523304,
97
- "grad_norm": 2.1320443631302006,
98
- "learning_rate": 1.1009174311926607e-05,
99
- "loss": 1.0019,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05999077065066913,
104
- "grad_norm": 2.4823142489717016,
105
- "learning_rate": 1.1926605504587156e-05,
106
- "loss": 1.0048,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06460544531610521,
111
- "grad_norm": 2.5334243917693295,
112
- "learning_rate": 1.2844036697247708e-05,
113
- "loss": 0.9828,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.0692201199815413,
118
- "grad_norm": 2.436109706504398,
119
- "learning_rate": 1.3761467889908258e-05,
120
- "loss": 0.9931,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07383479464697738,
125
- "grad_norm": 2.1346723037619695,
126
- "learning_rate": 1.4678899082568809e-05,
127
- "loss": 0.986,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07844946931241348,
132
- "grad_norm": 1.9789737582877578,
133
- "learning_rate": 1.559633027522936e-05,
134
- "loss": 1.0089,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08306414397784956,
139
- "grad_norm": 2.033387276422637,
140
- "learning_rate": 1.6513761467889912e-05,
141
- "loss": 1.0083,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08767881864328565,
146
- "grad_norm": 2.243076459165097,
147
- "learning_rate": 1.743119266055046e-05,
148
- "loss": 1.0175,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09229349330872173,
153
- "grad_norm": 1.953405447633714,
154
- "learning_rate": 1.834862385321101e-05,
155
- "loss": 1.025,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09229349330872173,
160
- "eval_loss": 1.0240174531936646,
161
- "eval_runtime": 714.5925,
162
- "eval_samples_per_second": 21.481,
163
- "eval_steps_per_second": 0.168,
164
  "step": 100
165
  },
166
  {
167
  "epoch": 0.09690816797415783,
168
- "grad_norm": 2.244939327354232,
169
- "learning_rate": 1.9266055045871563e-05,
170
- "loss": 1.0494,
171
  "step": 105
172
  },
173
  {
174
  "epoch": 0.10152284263959391,
175
- "grad_norm": 2.0565138965118406,
176
- "learning_rate": 1.9999947982262415e-05,
177
- "loss": 1.0345,
178
  "step": 110
179
  },
180
  {
181
  "epoch": 0.10613751730503,
182
- "grad_norm": 2.0332956026689386,
183
- "learning_rate": 1.9998127418269004e-05,
184
- "loss": 1.0454,
185
  "step": 115
186
  },
187
  {
188
  "epoch": 0.11075219197046608,
189
- "grad_norm": 2.1387249854354566,
190
- "learning_rate": 1.9993706508539968e-05,
191
- "loss": 1.0255,
192
  "step": 120
193
  },
194
  {
195
  "epoch": 0.11536686663590216,
196
- "grad_norm": 2.458364994524856,
197
- "learning_rate": 1.998668640288e-05,
198
- "loss": 1.0518,
199
  "step": 125
200
  },
201
  {
202
  "epoch": 0.11998154130133826,
203
- "grad_norm": 1.8099694655556404,
204
- "learning_rate": 1.997706892710117e-05,
205
- "loss": 1.0224,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 0.12459621596677434,
210
- "grad_norm": 2.202808282950665,
211
- "learning_rate": 1.9964856582548094e-05,
212
- "loss": 1.0552,
213
  "step": 135
214
  },
215
  {
216
  "epoch": 0.12921089063221042,
217
- "grad_norm": 1.7439069720540679,
218
- "learning_rate": 1.9950052545447354e-05,
219
- "loss": 1.0509,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.13382556529764653,
224
- "grad_norm": 1.8051398779741403,
225
- "learning_rate": 1.993266066608142e-05,
226
- "loss": 1.0298,
227
  "step": 145
228
  },
229
  {
230
  "epoch": 0.1384402399630826,
231
- "grad_norm": 1.949872455091144,
232
- "learning_rate": 1.991268546778726e-05,
233
- "loss": 1.0152,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.1430549146285187,
238
- "grad_norm": 2.004864424785268,
239
- "learning_rate": 1.9890132145779885e-05,
240
- "loss": 1.0682,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 0.14766958929395477,
245
- "grad_norm": 1.9881342422206065,
246
- "learning_rate": 1.986500656580118e-05,
247
- "loss": 1.0275,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 0.15228426395939088,
252
- "grad_norm": 1.895358789872697,
253
- "learning_rate": 1.9837315262594307e-05,
254
- "loss": 1.0341,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 0.15689893862482696,
259
- "grad_norm": 2.089231797232654,
260
- "learning_rate": 1.980706543820412e-05,
261
- "loss": 1.0367,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 0.16151361329026304,
266
- "grad_norm": 1.7637365585658213,
267
- "learning_rate": 1.9774264960104056e-05,
268
- "loss": 1.0223,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 0.16612828795569912,
273
- "grad_norm": 1.7824319082067301,
274
- "learning_rate": 1.9738922359149927e-05,
275
- "loss": 1.0352,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 0.1707429626211352,
280
- "grad_norm": 1.982313038360383,
281
- "learning_rate": 1.9701046827361175e-05,
282
- "loss": 1.0386,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 0.1753576372865713,
287
- "grad_norm": 1.6545978849182734,
288
- "learning_rate": 1.9660648215530207e-05,
289
- "loss": 1.0247,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 0.17997231195200739,
294
- "grad_norm": 1.782636456859102,
295
- "learning_rate": 1.9617737030660338e-05,
296
- "loss": 1.0305,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 0.18458698661744347,
301
- "grad_norm": 1.8378678572944849,
302
- "learning_rate": 1.9572324433233122e-05,
303
- "loss": 1.033,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.18458698661744347,
308
- "eval_loss": 1.0464073419570923,
309
- "eval_runtime": 647.0101,
310
- "eval_samples_per_second": 23.725,
311
- "eval_steps_per_second": 0.185,
312
  "step": 200
313
  },
314
  {
315
  "epoch": 0.18920166128287955,
316
- "grad_norm": 1.6648522003545267,
317
- "learning_rate": 1.9524422234305677e-05,
318
- "loss": 1.0268,
319
  "step": 205
320
  },
321
  {
322
  "epoch": 0.19381633594831565,
323
- "grad_norm": 1.7206083361405007,
324
- "learning_rate": 1.9474042892438848e-05,
325
- "loss": 1.0104,
326
  "step": 210
327
  },
328
  {
329
  "epoch": 0.19843101061375173,
330
- "grad_norm": 2.7587017854007194,
331
- "learning_rate": 1.942119951045692e-05,
332
- "loss": 1.0338,
333
  "step": 215
334
  },
335
  {
336
  "epoch": 0.20304568527918782,
337
- "grad_norm": 1.7930536986404009,
338
- "learning_rate": 1.9365905832039814e-05,
339
- "loss": 1.0614,
340
  "step": 220
341
  },
342
  {
343
  "epoch": 0.2076603599446239,
344
- "grad_norm": 1.7259048611678551,
345
- "learning_rate": 1.9308176238148565e-05,
346
- "loss": 1.051,
347
  "step": 225
348
  },
349
  {
350
  "epoch": 0.21227503461006,
351
- "grad_norm": 1.9820066538391885,
352
- "learning_rate": 1.924802574328509e-05,
353
- "loss": 1.0259,
354
  "step": 230
355
  },
356
  {
357
  "epoch": 0.21688970927549608,
358
- "grad_norm": 1.9752532611378077,
359
- "learning_rate": 1.9185469991587166e-05,
360
- "loss": 1.045,
361
  "step": 235
362
  },
363
  {
364
  "epoch": 0.22150438394093216,
365
- "grad_norm": 1.6132930572572803,
366
- "learning_rate": 1.912052525275965e-05,
367
- "loss": 1.0343,
368
  "step": 240
369
  },
370
  {
371
  "epoch": 0.22611905860636825,
372
- "grad_norm": 1.7584131810475476,
373
- "learning_rate": 1.905320841784298e-05,
374
- "loss": 1.0341,
375
  "step": 245
376
  },
377
  {
378
  "epoch": 0.23073373327180433,
379
- "grad_norm": 1.70806063018753,
380
- "learning_rate": 1.898353699482014e-05,
381
- "loss": 1.0335,
382
  "step": 250
383
  },
384
  {
385
  "epoch": 0.23534840793724043,
386
- "grad_norm": 1.6516040151613625,
387
- "learning_rate": 1.8911529104063093e-05,
388
- "loss": 1.0388,
389
  "step": 255
390
  },
391
  {
392
  "epoch": 0.23996308260267651,
393
- "grad_norm": 1.6090722840992302,
394
- "learning_rate": 1.8837203473619978e-05,
395
- "loss": 1.0423,
396
  "step": 260
397
  },
398
  {
399
  "epoch": 0.2445777572681126,
400
- "grad_norm": 1.7713270990326546,
401
- "learning_rate": 1.8760579434344283e-05,
402
- "loss": 1.0475,
403
  "step": 265
404
  },
405
  {
406
  "epoch": 0.24919243193354867,
407
- "grad_norm": 1.658315636443391,
408
- "learning_rate": 1.8681676914867176e-05,
409
- "loss": 1.0484,
410
  "step": 270
411
  },
412
  {
413
  "epoch": 0.25380710659898476,
414
- "grad_norm": 1.7916964421225479,
415
- "learning_rate": 1.860051643641443e-05,
416
- "loss": 1.0312,
417
  "step": 275
418
  },
419
  {
420
  "epoch": 0.25842178126442084,
421
- "grad_norm": 1.6115818403548052,
422
- "learning_rate": 1.8517119107469194e-05,
423
- "loss": 1.0406,
424
  "step": 280
425
  },
426
  {
427
  "epoch": 0.26303645592985697,
428
- "grad_norm": 1.5352737986672527,
429
- "learning_rate": 1.8431506618282e-05,
430
- "loss": 1.0308,
431
  "step": 285
432
  },
433
  {
434
  "epoch": 0.26765113059529305,
435
- "grad_norm": 1.6588566683503214,
436
- "learning_rate": 1.834370123522954e-05,
437
- "loss": 1.0427,
438
  "step": 290
439
  },
440
  {
441
  "epoch": 0.27226580526072913,
442
- "grad_norm": 1.5272038076819447,
443
- "learning_rate": 1.8253725795023504e-05,
444
- "loss": 1.0309,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 0.2768804799261652,
449
- "grad_norm": 1.5715430487703328,
450
- "learning_rate": 1.816160369877117e-05,
451
- "loss": 1.037,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 0.2768804799261652,
456
- "eval_loss": 1.0424165725708008,
457
- "eval_runtime": 645.7635,
458
- "eval_samples_per_second": 23.77,
459
- "eval_steps_per_second": 0.186,
460
  "step": 300
461
  },
462
  {
463
  "epoch": 0.2814951545916013,
464
- "grad_norm": 1.7174412533746373,
465
- "learning_rate": 1.8067358905889148e-05,
466
- "loss": 1.0107,
467
  "step": 305
468
  },
469
  {
470
  "epoch": 0.2861098292570374,
471
- "grad_norm": 1.6294029724129888,
472
- "learning_rate": 1.797101592787194e-05,
473
- "loss": 1.0333,
474
  "step": 310
475
  },
476
  {
477
  "epoch": 0.29072450392247345,
478
- "grad_norm": 1.5569394040938476,
479
- "learning_rate": 1.7872599821916922e-05,
480
- "loss": 1.0253,
481
  "step": 315
482
  },
483
  {
484
  "epoch": 0.29533917858790953,
485
- "grad_norm": 1.5576353282044078,
486
- "learning_rate": 1.7772136184407367e-05,
487
- "loss": 1.0258,
488
  "step": 320
489
  },
490
  {
491
  "epoch": 0.2999538532533456,
492
- "grad_norm": 1.6327760193096779,
493
- "learning_rate": 1.7669651144255265e-05,
494
- "loss": 1.0354,
495
  "step": 325
496
  },
497
  {
498
  "epoch": 0.30456852791878175,
499
- "grad_norm": 1.5902185774151916,
500
- "learning_rate": 1.7565171356105627e-05,
501
- "loss": 1.0473,
502
  "step": 330
503
  },
504
  {
505
  "epoch": 0.30918320258421783,
506
- "grad_norm": 1.6853713064054245,
507
- "learning_rate": 1.7458723993404065e-05,
508
- "loss": 1.0423,
509
  "step": 335
510
  },
511
  {
512
  "epoch": 0.3137978772496539,
513
- "grad_norm": 1.6156935982005598,
514
- "learning_rate": 1.7350336741329413e-05,
515
- "loss": 1.032,
516
  "step": 340
517
  },
518
  {
519
  "epoch": 0.31841255191509,
520
- "grad_norm": 1.6752591408238855,
521
- "learning_rate": 1.7240037789593307e-05,
522
- "loss": 1.0409,
523
  "step": 345
524
  },
525
  {
526
  "epoch": 0.3230272265805261,
527
- "grad_norm": 1.6216674961461026,
528
- "learning_rate": 1.712785582510848e-05,
529
- "loss": 1.0146,
530
  "step": 350
531
  },
532
  {
533
  "epoch": 0.32764190124596215,
534
- "grad_norm": 1.5384588472944032,
535
- "learning_rate": 1.70138200245278e-05,
536
- "loss": 1.0257,
537
  "step": 355
538
  },
539
  {
540
  "epoch": 0.33225657591139823,
541
- "grad_norm": 2.288338756325292,
542
- "learning_rate": 1.6897960046655886e-05,
543
- "loss": 1.033,
544
  "step": 360
545
  },
546
  {
547
  "epoch": 0.3368712505768343,
548
- "grad_norm": 1.7385161337142583,
549
- "learning_rate": 1.6780306024735384e-05,
550
- "loss": 1.0213,
551
  "step": 365
552
  },
553
  {
554
  "epoch": 0.3414859252422704,
555
- "grad_norm": 2.2001962662713885,
556
- "learning_rate": 1.6660888558609774e-05,
557
- "loss": 1.0451,
558
  "step": 370
559
  },
560
  {
561
  "epoch": 0.34610059990770653,
562
- "grad_norm": 1.5300219579349443,
563
- "learning_rate": 1.6539738706764895e-05,
564
- "loss": 1.0282,
565
  "step": 375
566
  },
567
  {
568
  "epoch": 0.3507152745731426,
569
- "grad_norm": 1.4444114105238783,
570
- "learning_rate": 1.6416887978251134e-05,
571
- "loss": 1.0203,
572
  "step": 380
573
  },
574
  {
575
  "epoch": 0.3553299492385787,
576
- "grad_norm": 1.5804824265139066,
577
- "learning_rate": 1.6292368324488462e-05,
578
- "loss": 1.0012,
579
  "step": 385
580
  },
581
  {
582
  "epoch": 0.35994462390401477,
583
- "grad_norm": 1.4610394973013912,
584
- "learning_rate": 1.6166212130956383e-05,
585
- "loss": 1.0116,
586
  "step": 390
587
  },
588
  {
589
  "epoch": 0.36455929856945085,
590
- "grad_norm": 1.5120140583271204,
591
- "learning_rate": 1.6038452208771037e-05,
592
- "loss": 1.0319,
593
  "step": 395
594
  },
595
  {
596
  "epoch": 0.36917397323488693,
597
- "grad_norm": 1.516281881967303,
598
- "learning_rate": 1.590912178615157e-05,
599
- "loss": 1.0136,
600
  "step": 400
601
  },
602
  {
603
  "epoch": 0.36917397323488693,
604
- "eval_loss": 1.0295383930206299,
605
- "eval_runtime": 651.8755,
606
- "eval_samples_per_second": 23.547,
607
- "eval_steps_per_second": 0.184,
608
  "step": 400
609
  },
610
  {
611
  "epoch": 0.373788647900323,
612
- "grad_norm": 1.7544165660209403,
613
- "learning_rate": 1.5778254499778006e-05,
614
- "loss": 1.0196,
615
  "step": 405
616
  },
617
  {
618
  "epoch": 0.3784033225657591,
619
- "grad_norm": 1.5692875836635374,
620
- "learning_rate": 1.564588438604296e-05,
621
- "loss": 1.0094,
622
  "step": 410
623
  },
624
  {
625
  "epoch": 0.3830179972311952,
626
- "grad_norm": 1.5588154190139185,
627
- "learning_rate": 1.551204587219928e-05,
628
- "loss": 0.9973,
629
  "step": 415
630
  },
631
  {
632
  "epoch": 0.3876326718966313,
633
- "grad_norm": 1.773673892309899,
634
- "learning_rate": 1.5376773767406142e-05,
635
- "loss": 1.0388,
636
  "step": 420
637
  },
638
  {
639
  "epoch": 0.3922473465620674,
640
- "grad_norm": 1.5489285794659653,
641
- "learning_rate": 1.5240103253675756e-05,
642
- "loss": 1.0087,
643
  "step": 425
644
  },
645
  {
646
  "epoch": 0.39686202122750347,
647
- "grad_norm": 1.6551129777825688,
648
- "learning_rate": 1.51020698767231e-05,
649
- "loss": 1.0164,
650
  "step": 430
651
  },
652
  {
653
  "epoch": 0.40147669589293955,
654
- "grad_norm": 1.4090672242535114,
655
- "learning_rate": 1.4962709536721087e-05,
656
- "loss": 0.997,
657
  "step": 435
658
  },
659
  {
660
  "epoch": 0.40609137055837563,
661
- "grad_norm": 1.5171777358410203,
662
- "learning_rate": 1.4822058478963532e-05,
663
- "loss": 1.0132,
664
  "step": 440
665
  },
666
  {
667
  "epoch": 0.4107060452238117,
668
- "grad_norm": 1.4969201768034885,
669
- "learning_rate": 1.4680153284438345e-05,
670
- "loss": 1.0119,
671
  "step": 445
672
  },
673
  {
674
  "epoch": 0.4153207198892478,
675
- "grad_norm": 1.5363202791746906,
676
- "learning_rate": 1.4537030860313443e-05,
677
- "loss": 1.0188,
678
  "step": 450
679
  },
680
  {
681
  "epoch": 0.41993539455468387,
682
- "grad_norm": 1.4963468746071473,
683
- "learning_rate": 1.4392728430337801e-05,
684
- "loss": 0.9952,
685
  "step": 455
686
  },
687
  {
688
  "epoch": 0.42455006922012,
689
- "grad_norm": 1.500810806740765,
690
- "learning_rate": 1.4247283525160178e-05,
691
- "loss": 0.9973,
692
  "step": 460
693
  },
694
  {
695
  "epoch": 0.4291647438855561,
696
- "grad_norm": 1.5508662694848825,
697
- "learning_rate": 1.4100733972568038e-05,
698
- "loss": 1.0085,
699
  "step": 465
700
  },
701
  {
702
  "epoch": 0.43377941855099217,
703
- "grad_norm": 1.5955354844051932,
704
- "learning_rate": 1.3953117887649153e-05,
705
- "loss": 1.0215,
706
  "step": 470
707
  },
708
  {
709
  "epoch": 0.43839409321642825,
710
- "grad_norm": 1.4682043182906732,
711
- "learning_rate": 1.3804473662878519e-05,
712
- "loss": 1.0143,
713
  "step": 475
714
  },
715
  {
716
  "epoch": 0.44300876788186433,
717
- "grad_norm": 1.4980307846950924,
718
- "learning_rate": 1.3654839958133118e-05,
719
- "loss": 1.0026,
720
  "step": 480
721
  },
722
  {
723
  "epoch": 0.4476234425473004,
724
- "grad_norm": 1.5233835818444807,
725
- "learning_rate": 1.3504255690637122e-05,
726
- "loss": 1.0205,
727
  "step": 485
728
  },
729
  {
730
  "epoch": 0.4522381172127365,
731
- "grad_norm": 1.4814525071349245,
732
- "learning_rate": 1.3352760024840174e-05,
733
- "loss": 0.9967,
734
  "step": 490
735
  },
736
  {
737
  "epoch": 0.45685279187817257,
738
- "grad_norm": 1.4499082430240968,
739
- "learning_rate": 1.3200392362231385e-05,
740
- "loss": 0.9842,
741
  "step": 495
742
  },
743
  {
744
  "epoch": 0.46146746654360865,
745
- "grad_norm": 1.5561735389313882,
746
- "learning_rate": 1.3047192331091636e-05,
747
- "loss": 1.0229,
748
  "step": 500
749
  },
750
  {
751
  "epoch": 0.46146746654360865,
752
- "eval_loss": 1.015141248703003,
753
- "eval_runtime": 633.8229,
754
- "eval_samples_per_second": 24.218,
755
- "eval_steps_per_second": 0.189,
756
  "step": 500
757
  },
758
  {
759
  "epoch": 0.4660821412090448,
760
- "grad_norm": 1.4233232765996602,
761
- "learning_rate": 1.2893199776186957e-05,
762
- "loss": 0.9936,
763
  "step": 505
764
  },
765
  {
766
  "epoch": 0.47069681587448087,
767
- "grad_norm": 1.54900822797248,
768
- "learning_rate": 1.2738454748405552e-05,
769
- "loss": 1.0102,
770
  "step": 510
771
  },
772
  {
773
  "epoch": 0.47531149053991695,
774
- "grad_norm": 1.4249354749013639,
775
- "learning_rate": 1.258299749434123e-05,
776
- "loss": 1.013,
777
  "step": 515
778
  },
779
  {
780
  "epoch": 0.47992616520535303,
781
- "grad_norm": 1.468956411146474,
782
- "learning_rate": 1.2426868445825955e-05,
783
- "loss": 1.0027,
784
  "step": 520
785
  },
786
  {
787
  "epoch": 0.4845408398707891,
788
- "grad_norm": 1.515134495058657,
789
- "learning_rate": 1.2270108209414186e-05,
790
- "loss": 0.9825,
791
  "step": 525
792
  },
793
  {
794
  "epoch": 0.4891555145362252,
795
- "grad_norm": 1.49493206284371,
796
- "learning_rate": 1.2112757555821796e-05,
797
- "loss": 0.9968,
798
  "step": 530
799
  },
800
  {
801
  "epoch": 0.49377018920166127,
802
- "grad_norm": 1.494232964423619,
803
- "learning_rate": 1.1954857409322302e-05,
804
- "loss": 0.9808,
805
  "step": 535
806
  },
807
  {
808
  "epoch": 0.49838486386709735,
809
- "grad_norm": 1.5895499778471747,
810
- "learning_rate": 1.179644883710313e-05,
811
- "loss": 0.996,
812
  "step": 540
813
  },
814
  {
815
  "epoch": 0.5029995385325334,
816
- "grad_norm": 1.575516689496947,
817
- "learning_rate": 1.1637573038584729e-05,
818
- "loss": 0.9843,
819
  "step": 545
820
  },
821
  {
822
  "epoch": 0.5076142131979695,
823
- "grad_norm": 1.5289310135121519,
824
- "learning_rate": 1.1478271334705302e-05,
825
- "loss": 0.9897,
826
  "step": 550
827
  },
828
  {
829
  "epoch": 0.5122288878634056,
830
- "grad_norm": 1.487892885517731,
831
- "learning_rate": 1.1318585157173913e-05,
832
- "loss": 0.9965,
833
  "step": 555
834
  },
835
  {
836
  "epoch": 0.5168435625288417,
837
- "grad_norm": 1.504695649448808,
838
- "learning_rate": 1.115855603769479e-05,
839
- "loss": 0.9864,
840
  "step": 560
841
  },
842
  {
843
  "epoch": 0.5214582371942778,
844
- "grad_norm": 1.444258657078223,
845
- "learning_rate": 1.0998225597165628e-05,
846
- "loss": 0.9824,
847
  "step": 565
848
  },
849
  {
850
  "epoch": 0.5260729118597139,
851
- "grad_norm": 1.452291205660523,
852
- "learning_rate": 1.0837635534852687e-05,
853
- "loss": 0.9806,
854
  "step": 570
855
  },
856
  {
857
  "epoch": 0.53068758652515,
858
- "grad_norm": 1.4809970617721466,
859
- "learning_rate": 1.0676827617545511e-05,
860
- "loss": 0.98,
861
  "step": 575
862
  },
863
  {
864
  "epoch": 0.5353022611905861,
865
- "grad_norm": 1.4688234901022226,
866
- "learning_rate": 1.0515843668694087e-05,
867
- "loss": 0.9785,
868
  "step": 580
869
  },
870
  {
871
  "epoch": 0.5399169358560222,
872
- "grad_norm": 1.4825659064745627,
873
- "learning_rate": 1.0354725557531258e-05,
874
- "loss": 0.9776,
875
  "step": 585
876
  },
877
  {
878
  "epoch": 0.5445316105214583,
879
- "grad_norm": 1.3801777122885093,
880
- "learning_rate": 1.0193515188183246e-05,
881
- "loss": 0.9687,
882
  "step": 590
883
  },
884
  {
885
  "epoch": 0.5491462851868943,
886
- "grad_norm": 1.421023225061784,
887
- "learning_rate": 1.003225448877108e-05,
888
- "loss": 0.9964,
889
  "step": 595
890
  },
891
  {
892
  "epoch": 0.5537609598523304,
893
- "grad_norm": 1.3889284539657671,
894
- "learning_rate": 9.870985400505805e-06,
895
- "loss": 0.9745,
896
  "step": 600
897
  },
898
  {
899
  "epoch": 0.5537609598523304,
900
- "eval_loss": 0.9945215582847595,
901
- "eval_runtime": 661.3891,
902
- "eval_samples_per_second": 23.209,
903
- "eval_steps_per_second": 0.181,
904
  "step": 600
905
  },
906
  {
907
  "epoch": 0.5583756345177665,
908
- "grad_norm": 1.4429569586116144,
909
- "learning_rate": 9.709749866780248e-06,
910
- "loss": 0.9805,
911
  "step": 605
912
  },
913
  {
914
  "epoch": 0.5629903091832026,
915
- "grad_norm": 1.4656449742761994,
916
- "learning_rate": 9.548589822260281e-06,
917
- "loss": 0.9895,
918
  "step": 610
919
  },
920
  {
921
  "epoch": 0.5676049838486387,
922
- "grad_norm": 1.3965932035586004,
923
- "learning_rate": 9.387547181978291e-06,
924
- "loss": 0.9744,
925
  "step": 615
926
  },
927
  {
928
  "epoch": 0.5722196585140747,
929
- "grad_norm": 1.359374657149616,
930
- "learning_rate": 9.226663830431777e-06,
931
- "loss": 0.9824,
932
  "step": 620
933
  },
934
  {
935
  "epoch": 0.5768343331795108,
936
- "grad_norm": 1.3668229629199753,
937
- "learning_rate": 9.065981610689915e-06,
938
- "loss": 0.9706,
939
  "step": 625
940
  },
941
  {
942
  "epoch": 0.5814490078449469,
943
- "grad_norm": 1.373500531171451,
944
- "learning_rate": 8.905542313510846e-06,
945
- "loss": 0.9796,
946
  "step": 630
947
  },
948
  {
949
  "epoch": 0.586063682510383,
950
- "grad_norm": 1.4067124446675243,
951
- "learning_rate": 8.745387666472639e-06,
952
- "loss": 0.9879,
953
  "step": 635
954
  },
955
  {
956
  "epoch": 0.5906783571758191,
957
- "grad_norm": 1.436245514521079,
958
- "learning_rate": 8.58555932312059e-06,
959
- "loss": 0.9894,
960
  "step": 640
961
  },
962
  {
963
  "epoch": 0.5952930318412551,
964
- "grad_norm": 1.429504715827128,
965
- "learning_rate": 8.426098852133892e-06,
966
- "loss": 0.9643,
967
  "step": 645
968
  },
969
  {
970
  "epoch": 0.5999077065066912,
971
- "grad_norm": 1.3728127558164411,
972
- "learning_rate": 8.267047726514278e-06,
973
- "loss": 0.9813,
974
  "step": 650
975
  },
976
  {
977
  "epoch": 0.6045223811721273,
978
- "grad_norm": 1.3422366968784711,
979
- "learning_rate": 8.108447312799588e-06,
980
- "loss": 0.972,
981
  "step": 655
982
  },
983
  {
984
  "epoch": 0.6091370558375635,
985
- "grad_norm": 1.4348417465202754,
986
- "learning_rate": 7.950338860305049e-06,
987
- "loss": 0.9638,
988
  "step": 660
989
  },
990
  {
991
  "epoch": 0.6137517305029996,
992
- "grad_norm": 1.3342023162033965,
993
- "learning_rate": 7.792763490394983e-06,
994
- "loss": 0.9733,
995
  "step": 665
996
  },
997
  {
998
  "epoch": 0.6183664051684357,
999
- "grad_norm": 1.361475388045652,
1000
- "learning_rate": 7.635762185787868e-06,
1001
- "loss": 0.9773,
1002
  "step": 670
1003
  },
1004
  {
1005
  "epoch": 0.6229810798338717,
1006
- "grad_norm": 1.3634924688905254,
1007
- "learning_rate": 7.479375779897379e-06,
1008
- "loss": 0.9747,
1009
  "step": 675
1010
  },
1011
  {
1012
  "epoch": 0.6275957544993078,
1013
- "grad_norm": 1.3732265984949414,
1014
- "learning_rate": 7.3236449462123315e-06,
1015
- "loss": 0.9678,
1016
  "step": 680
1017
  },
1018
  {
1019
  "epoch": 0.6322104291647439,
1020
- "grad_norm": 1.4464461120602612,
1021
- "learning_rate": 7.168610187718164e-06,
1022
- "loss": 0.9662,
1023
  "step": 685
1024
  },
1025
  {
1026
  "epoch": 0.63682510383018,
1027
- "grad_norm": 1.3931117990795983,
1028
- "learning_rate": 7.014311826362804e-06,
1029
- "loss": 0.9641,
1030
  "step": 690
1031
  },
1032
  {
1033
  "epoch": 0.6414397784956161,
1034
- "grad_norm": 1.366546097704984,
1035
- "learning_rate": 6.860789992569601e-06,
1036
- "loss": 0.9787,
1037
  "step": 695
1038
  },
1039
  {
1040
  "epoch": 0.6460544531610521,
1041
- "grad_norm": 1.3945778923545584,
1042
- "learning_rate": 6.708084614800065e-06,
1043
- "loss": 0.9441,
1044
  "step": 700
1045
  },
1046
  {
1047
  "epoch": 0.6460544531610521,
1048
- "eval_loss": 0.9769104719161987,
1049
- "eval_runtime": 633.9092,
1050
- "eval_samples_per_second": 24.215,
1051
- "eval_steps_per_second": 0.189,
1052
  "step": 700
1053
  },
1054
  {
1055
  "epoch": 0.6506691278264882,
1056
- "grad_norm": 1.364918482537208,
1057
- "learning_rate": 6.556235409169154e-06,
1058
- "loss": 0.9437,
1059
  "step": 705
1060
  },
1061
  {
1062
  "epoch": 0.6552838024919243,
1063
- "grad_norm": 1.3330832614943129,
1064
- "learning_rate": 6.405281869115768e-06,
1065
- "loss": 0.9482,
1066
  "step": 710
1067
  },
1068
  {
1069
  "epoch": 0.6598984771573604,
1070
- "grad_norm": 1.3291401175998692,
1071
- "learning_rate": 6.255263255131172e-06,
1072
- "loss": 0.9646,
1073
  "step": 715
1074
  },
1075
  {
1076
  "epoch": 0.6645131518227965,
1077
- "grad_norm": 1.3661394031338707,
1078
- "learning_rate": 6.106218584547992e-06,
1079
- "loss": 0.9649,
1080
  "step": 720
1081
  },
1082
  {
1083
  "epoch": 0.6691278264882325,
1084
- "grad_norm": 1.3117340443959773,
1085
- "learning_rate": 5.9581866213924656e-06,
1086
- "loss": 0.9525,
1087
  "step": 725
1088
  },
1089
  {
1090
  "epoch": 0.6737425011536686,
1091
- "grad_norm": 1.364658394013176,
1092
- "learning_rate": 5.811205866302571e-06,
1093
- "loss": 0.9516,
1094
  "step": 730
1095
  },
1096
  {
1097
  "epoch": 0.6783571758191047,
1098
- "grad_norm": 1.331994492768848,
1099
- "learning_rate": 5.665314546514633e-06,
1100
- "loss": 0.954,
1101
  "step": 735
1102
  },
1103
  {
1104
  "epoch": 0.6829718504845408,
1105
- "grad_norm": 1.3743467262940992,
1106
- "learning_rate": 5.520550605921091e-06,
1107
- "loss": 0.9554,
1108
  "step": 740
1109
  },
1110
  {
1111
  "epoch": 0.687586525149977,
1112
- "grad_norm": 1.3312291076208118,
1113
- "learning_rate": 5.376951695201894e-06,
1114
- "loss": 0.9565,
1115
  "step": 745
1116
  },
1117
  {
1118
  "epoch": 0.6922011998154131,
1119
- "grad_norm": 1.3832998972367352,
1120
- "learning_rate": 5.234555162032221e-06,
1121
- "loss": 0.9475,
1122
  "step": 750
1123
  },
1124
  {
1125
  "epoch": 0.6968158744808491,
1126
- "grad_norm": 1.389562727942595,
1127
- "learning_rate": 5.093398041368942e-06,
1128
- "loss": 0.9574,
1129
  "step": 755
1130
  },
1131
  {
1132
  "epoch": 0.7014305491462852,
1133
- "grad_norm": 1.3714452844531986,
1134
- "learning_rate": 4.9535170458184735e-06,
1135
- "loss": 0.9581,
1136
  "step": 760
1137
  },
1138
  {
1139
  "epoch": 0.7060452238117213,
1140
- "grad_norm": 1.3477889477630838,
1141
- "learning_rate": 4.81494855608843e-06,
1142
- "loss": 0.9561,
1143
  "step": 765
1144
  },
1145
  {
1146
  "epoch": 0.7106598984771574,
1147
- "grad_norm": 1.4491931180376743,
1148
- "learning_rate": 4.677728611525605e-06,
1149
- "loss": 0.9512,
1150
  "step": 770
1151
  },
1152
  {
1153
  "epoch": 0.7152745731425935,
1154
- "grad_norm": 1.3241497550464327,
1155
- "learning_rate": 4.541892900742757e-06,
1156
- "loss": 0.9422,
1157
  "step": 775
1158
  },
1159
  {
1160
  "epoch": 0.7198892478080295,
1161
- "grad_norm": 1.314421280157553,
1162
- "learning_rate": 4.407476752336575e-06,
1163
- "loss": 0.943,
1164
  "step": 780
1165
  },
1166
  {
1167
  "epoch": 0.7245039224734656,
1168
- "grad_norm": 1.2755970945876594,
1169
- "learning_rate": 4.2745151256993325e-06,
1170
- "loss": 0.9426,
1171
  "step": 785
1172
  },
1173
  {
1174
  "epoch": 0.7291185971389017,
1175
- "grad_norm": 1.332124542587031,
1176
- "learning_rate": 4.143042601926492e-06,
1177
- "loss": 0.9533,
1178
  "step": 790
1179
  },
1180
  {
1181
  "epoch": 0.7337332718043378,
1182
- "grad_norm": 1.3708413423330084,
1183
- "learning_rate": 4.013093374822789e-06,
1184
- "loss": 0.9374,
1185
  "step": 795
1186
  },
1187
  {
1188
  "epoch": 0.7383479464697739,
1189
- "grad_norm": 1.27203160584856,
1190
- "learning_rate": 3.884701242008949e-06,
1191
- "loss": 0.9277,
1192
  "step": 800
1193
  },
1194
  {
1195
  "epoch": 0.7383479464697739,
1196
- "eval_loss": 0.9612703323364258,
1197
- "eval_runtime": 651.9955,
1198
- "eval_samples_per_second": 23.543,
1199
- "eval_steps_per_second": 0.184,
1200
  "step": 800
1201
  },
1202
  {
1203
  "epoch": 0.7429626211352099,
1204
- "grad_norm": 1.31377359076841,
1205
- "learning_rate": 3.757899596131529e-06,
1206
- "loss": 0.9611,
1207
  "step": 805
1208
  },
1209
  {
1210
  "epoch": 0.747577295800646,
1211
- "grad_norm": 1.2975998004112579,
1212
- "learning_rate": 3.6327214161780287e-06,
1213
- "loss": 0.9798,
1214
  "step": 810
1215
  },
1216
  {
1217
  "epoch": 0.7521919704660821,
1218
- "grad_norm": 1.3894464184722144,
1219
- "learning_rate": 3.5091992588996026e-06,
1220
- "loss": 0.9567,
1221
  "step": 815
1222
  },
1223
  {
1224
  "epoch": 0.7568066451315182,
1225
- "grad_norm": 1.282473554343769,
1226
- "learning_rate": 3.387365250343615e-06,
1227
- "loss": 0.954,
1228
  "step": 820
1229
  },
1230
  {
1231
  "epoch": 0.7614213197969543,
1232
- "grad_norm": 1.3647805901178591,
1233
- "learning_rate": 3.2672510774981692e-06,
1234
- "loss": 0.9361,
1235
  "step": 825
1236
  },
1237
  {
1238
  "epoch": 0.7660359944623903,
1239
- "grad_norm": 1.347888877348781,
1240
- "learning_rate": 3.148887980050872e-06,
1241
- "loss": 0.9432,
1242
  "step": 830
1243
  },
1244
  {
1245
  "epoch": 0.7706506691278265,
1246
- "grad_norm": 1.377090549096584,
1247
- "learning_rate": 3.032306742263891e-06,
1248
- "loss": 0.9519,
1249
  "step": 835
1250
  },
1251
  {
1252
  "epoch": 0.7752653437932626,
1253
- "grad_norm": 1.287471382339416,
1254
- "learning_rate": 2.9175376849675076e-06,
1255
- "loss": 0.9607,
1256
  "step": 840
1257
  },
1258
  {
1259
  "epoch": 0.7798800184586987,
1260
- "grad_norm": 1.4070979616878627,
1261
- "learning_rate": 2.8046106576741605e-06,
1262
- "loss": 0.929,
1263
  "step": 845
1264
  },
1265
  {
1266
  "epoch": 0.7844946931241348,
1267
- "grad_norm": 1.296812382608952,
1268
- "learning_rate": 2.693555030815085e-06,
1269
- "loss": 0.9383,
1270
  "step": 850
1271
  },
1272
  {
1273
  "epoch": 0.7891093677895709,
1274
- "grad_norm": 1.345957217872087,
1275
- "learning_rate": 2.5843996881015676e-06,
1276
- "loss": 0.9378,
1277
  "step": 855
1278
  },
1279
  {
1280
  "epoch": 0.7937240424550069,
1281
- "grad_norm": 1.3218142306826084,
1282
- "learning_rate": 2.4771730190127616e-06,
1283
- "loss": 0.9353,
1284
  "step": 860
1285
  },
1286
  {
1287
  "epoch": 0.798338717120443,
1288
- "grad_norm": 1.2932161650654428,
1289
- "learning_rate": 2.3719029114120716e-06,
1290
- "loss": 0.9333,
1291
  "step": 865
1292
  },
1293
  {
1294
  "epoch": 0.8029533917858791,
1295
- "grad_norm": 1.36407173107211,
1296
- "learning_rate": 2.2686167442939733e-06,
1297
- "loss": 0.9401,
1298
  "step": 870
1299
  },
1300
  {
1301
  "epoch": 0.8075680664513152,
1302
- "grad_norm": 1.302064737279862,
1303
- "learning_rate": 2.1673413806632104e-06,
1304
- "loss": 0.939,
1305
  "step": 875
1306
  },
1307
  {
1308
  "epoch": 0.8121827411167513,
1309
- "grad_norm": 1.3229653318762729,
1310
- "learning_rate": 2.0681031605481563e-06,
1311
- "loss": 0.9355,
1312
  "step": 880
1313
  },
1314
  {
1315
  "epoch": 0.8167974157821873,
1316
- "grad_norm": 1.3252223231469167,
1317
- "learning_rate": 1.9709278941502363e-06,
1318
- "loss": 0.9344,
1319
  "step": 885
1320
  },
1321
  {
1322
  "epoch": 0.8214120904476234,
1323
- "grad_norm": 1.3270338774644677,
1324
- "learning_rate": 1.8758408551311048e-06,
1325
- "loss": 0.9321,
1326
  "step": 890
1327
  },
1328
  {
1329
  "epoch": 0.8260267651130595,
1330
- "grad_norm": 1.3116552675081934,
1331
- "learning_rate": 1.7828667740394045e-06,
1332
- "loss": 0.9513,
1333
  "step": 895
1334
  },
1335
  {
1336
  "epoch": 0.8306414397784956,
1337
- "grad_norm": 1.2617601634166857,
1338
- "learning_rate": 1.6920298318787532e-06,
1339
- "loss": 0.9384,
1340
  "step": 900
1341
  },
1342
  {
1343
  "epoch": 0.8306414397784956,
1344
- "eval_loss": 0.9501336812973022,
1345
- "eval_runtime": 633.7004,
1346
- "eval_samples_per_second": 24.223,
1347
- "eval_steps_per_second": 0.189,
1348
  "step": 900
1349
  },
1350
  {
1351
  "epoch": 0.8352561144439317,
1352
- "grad_norm": 1.3036810903046379,
1353
- "learning_rate": 1.6033536538186778e-06,
1354
- "loss": 0.9363,
1355
  "step": 905
1356
  },
1357
  {
1358
  "epoch": 0.8398707891093677,
1359
- "grad_norm": 1.3184782817538288,
1360
- "learning_rate": 1.5168613030500922e-06,
1361
- "loss": 0.9254,
1362
  "step": 910
1363
  },
1364
  {
1365
  "epoch": 0.8444854637748038,
1366
- "grad_norm": 1.3311174144545366,
1367
- "learning_rate": 1.4325752747869626e-06,
1368
- "loss": 0.9401,
1369
  "step": 915
1370
  },
1371
  {
1372
  "epoch": 0.84910013844024,
1373
- "grad_norm": 1.2485403971692124,
1374
- "learning_rate": 1.3505174904156593e-06,
1375
- "loss": 0.9305,
1376
  "step": 920
1377
  },
1378
  {
1379
  "epoch": 0.8537148131056761,
1380
- "grad_norm": 1.302635001110673,
1381
- "learning_rate": 1.2707092917935914e-06,
1382
- "loss": 0.9393,
1383
  "step": 925
1384
  },
1385
  {
1386
  "epoch": 0.8583294877711122,
1387
- "grad_norm": 1.3633354249041523,
1388
- "learning_rate": 1.1931714356985257e-06,
1389
- "loss": 0.9312,
1390
  "step": 930
1391
  },
1392
  {
1393
  "epoch": 0.8629441624365483,
1394
- "grad_norm": 1.2815769915508204,
1395
- "learning_rate": 1.1179240884301158e-06,
1396
- "loss": 0.9217,
1397
  "step": 935
1398
  },
1399
  {
1400
  "epoch": 0.8675588371019843,
1401
- "grad_norm": 1.3419818473322924,
1402
- "learning_rate": 1.0449868205649648e-06,
1403
- "loss": 0.9168,
1404
  "step": 940
1405
  },
1406
  {
1407
  "epoch": 0.8721735117674204,
1408
- "grad_norm": 1.3006753146842553,
1409
- "learning_rate": 9.74378601866669e-07,
1410
- "loss": 0.9413,
1411
  "step": 945
1412
  },
1413
  {
1414
  "epoch": 0.8767881864328565,
1415
- "grad_norm": 1.275285958470618,
1416
- "learning_rate": 9.061177963520751e-07,
1417
- "loss": 0.9446,
1418
  "step": 950
1419
  },
1420
  {
1421
  "epoch": 0.8814028610982926,
1422
- "grad_norm": 1.2680303522716787,
1423
- "learning_rate": 8.402221575151238e-07,
1424
- "loss": 0.9161,
1425
  "step": 955
1426
  },
1427
  {
1428
  "epoch": 0.8860175357637287,
1429
- "grad_norm": 1.2495043326934117,
1430
- "learning_rate": 7.767088237094578e-07,
1431
- "loss": 0.9333,
1432
  "step": 960
1433
  },
1434
  {
1435
  "epoch": 0.8906322104291647,
1436
- "grad_norm": 1.3345940160548069,
1437
- "learning_rate": 7.155943136910193e-07,
1438
- "loss": 0.9353,
1439
  "step": 965
1440
  },
1441
  {
1442
  "epoch": 0.8952468850946008,
1443
- "grad_norm": 1.3252439901574087,
1444
- "learning_rate": 6.568945223218048e-07,
1445
- "loss": 0.9381,
1446
  "step": 970
1447
  },
1448
  {
1449
  "epoch": 0.8998615597600369,
1450
- "grad_norm": 1.2795828639710098,
1451
- "learning_rate": 6.00624716435868e-07,
1452
- "loss": 0.9199,
1453
  "step": 975
1454
  },
1455
  {
1456
  "epoch": 0.904476234425473,
1457
- "grad_norm": 1.3413550832303935,
1458
- "learning_rate": 5.467995308686813e-07,
1459
- "loss": 0.94,
1460
  "step": 980
1461
  },
1462
  {
1463
  "epoch": 0.9090909090909091,
1464
- "grad_norm": 1.3436022079621268,
1465
- "learning_rate": 4.954329646508505e-07,
1466
- "loss": 0.9313,
1467
  "step": 985
1468
  },
1469
  {
1470
  "epoch": 0.9137055837563451,
1471
- "grad_norm": 1.3709638383836422,
1472
- "learning_rate": 4.4653837736721273e-07,
1473
- "loss": 0.9346,
1474
  "step": 990
1475
  },
1476
  {
1477
  "epoch": 0.9183202584217812,
1478
- "grad_norm": 1.2668090185362362,
1479
- "learning_rate": 4.001284856822174e-07,
1480
- "loss": 0.9408,
1481
  "step": 995
1482
  },
1483
  {
1484
  "epoch": 0.9229349330872173,
1485
- "grad_norm": 1.2697339769613498,
1486
- "learning_rate": 3.562153600325491e-07,
1487
- "loss": 0.9216,
1488
  "step": 1000
1489
  },
1490
  {
1491
  "epoch": 0.9229349330872173,
1492
- "eval_loss": 0.9448357224464417,
1493
- "eval_runtime": 634.4549,
1494
- "eval_samples_per_second": 24.194,
1495
- "eval_steps_per_second": 0.189,
1496
  "step": 1000
1497
  },
1498
  {
1499
  "epoch": 0.9275496077526535,
1500
- "grad_norm": 1.281655332093928,
1501
- "learning_rate": 3.1481042148779674e-07,
1502
- "loss": 0.9399,
1503
  "step": 1005
1504
  },
1505
  {
1506
  "epoch": 0.9321642824180896,
1507
- "grad_norm": 1.3600956214897377,
1508
- "learning_rate": 2.7592443878003196e-07,
1509
- "loss": 0.9408,
1510
  "step": 1010
1511
  },
1512
  {
1513
  "epoch": 0.9367789570835257,
1514
- "grad_norm": 1.3158687412306498,
1515
- "learning_rate": 2.395675255030383e-07,
1516
- "loss": 0.9227,
1517
  "step": 1015
1518
  },
1519
  {
1520
  "epoch": 0.9413936317489617,
1521
- "grad_norm": 1.321042213237677,
1522
- "learning_rate": 2.057491374819365e-07,
1523
- "loss": 0.9251,
1524
  "step": 1020
1525
  },
1526
  {
1527
  "epoch": 0.9460083064143978,
1528
- "grad_norm": 1.2706676798657595,
1529
- "learning_rate": 1.7447807031388264e-07,
1530
- "loss": 0.9332,
1531
  "step": 1025
1532
  },
1533
  {
1534
  "epoch": 0.9506229810798339,
1535
- "grad_norm": 1.3025080688365438,
1536
- "learning_rate": 1.457624570804772e-07,
1537
- "loss": 0.9171,
1538
  "step": 1030
1539
  },
1540
  {
1541
  "epoch": 0.95523765574527,
1542
- "grad_norm": 1.3183940856356338,
1543
- "learning_rate": 1.196097662324902e-07,
1544
- "loss": 0.9394,
1545
  "step": 1035
1546
  },
1547
  {
1548
  "epoch": 0.9598523304107061,
1549
- "grad_norm": 1.2959502551091049,
1550
- "learning_rate": 9.602679964744288e-08,
1551
- "loss": 0.9171,
1552
  "step": 1040
1553
  },
1554
  {
1555
  "epoch": 0.9644670050761421,
1556
- "grad_norm": 1.3183746243018815,
1557
- "learning_rate": 7.501969086054717e-08,
1558
- "loss": 0.9328,
1559
  "step": 1045
1560
  },
1561
  {
1562
  "epoch": 0.9690816797415782,
1563
- "grad_norm": 1.3307943534831945,
1564
- "learning_rate": 5.659390346948179e-08,
1565
- "loss": 0.9424,
1566
  "step": 1050
1567
  },
1568
  {
1569
  "epoch": 0.9736963544070143,
1570
- "grad_norm": 1.2730525780758684,
1571
- "learning_rate": 4.075422971340115e-08,
1572
- "loss": 0.9402,
1573
  "step": 1055
1574
  },
1575
  {
1576
  "epoch": 0.9783110290724504,
1577
- "grad_norm": 1.3093625889696177,
1578
- "learning_rate": 2.7504789226548977e-08,
1579
- "loss": 0.9275,
1580
  "step": 1060
1581
  },
1582
  {
1583
  "epoch": 0.9829257037378865,
1584
- "grad_norm": 1.2514296608879,
1585
- "learning_rate": 1.6849027966816535e-08,
1586
- "loss": 0.9269,
1587
  "step": 1065
1588
  },
1589
  {
1590
  "epoch": 0.9875403784033225,
1591
- "grad_norm": 1.273379135333167,
1592
- "learning_rate": 8.789717319505065e-09,
1593
- "loss": 0.9362,
1594
  "step": 1070
1595
  },
1596
  {
1597
  "epoch": 0.9921550530687586,
1598
- "grad_norm": 1.2811332538414983,
1599
- "learning_rate": 3.328953376530164e-09,
1600
- "loss": 0.9313,
1601
  "step": 1075
1602
  },
1603
  {
1604
  "epoch": 0.9967697277341947,
1605
- "grad_norm": 1.3117234277097758,
1606
- "learning_rate": 4.681563912700693e-10,
1607
- "loss": 0.9204,
1608
  "step": 1080
1609
  },
1610
  {
1611
  "epoch": 0.9995385325334564,
1612
  "step": 1083,
1613
  "total_flos": 453306954547200.0,
1614
- "train_loss": 0.986508995762382,
1615
- "train_runtime": 33955.1767,
1616
- "train_samples_per_second": 4.084,
1617
- "train_steps_per_second": 0.032
1618
  }
1619
  ],
1620
  "logging_steps": 5,
 
10
  "log_history": [
11
  {
12
  "epoch": 0.0009229349330872173,
13
+ "grad_norm": 9.222650171740101,
14
+ "learning_rate": 9.174311926605506e-08,
15
+ "loss": 1.1391,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.0046146746654360865,
20
+ "grad_norm": 8.813354760736841,
21
+ "learning_rate": 4.587155963302753e-07,
22
+ "loss": 1.1346,
23
  "step": 5
24
  },
25
  {
26
  "epoch": 0.009229349330872173,
27
+ "grad_norm": 5.082254385960609,
28
+ "learning_rate": 9.174311926605506e-07,
29
+ "loss": 1.0934,
30
  "step": 10
31
  },
32
  {
33
  "epoch": 0.01384402399630826,
34
+ "grad_norm": 3.211536911547781,
35
+ "learning_rate": 1.3761467889908258e-06,
36
+ "loss": 1.0264,
37
  "step": 15
38
  },
39
  {
40
  "epoch": 0.018458698661744346,
41
+ "grad_norm": 2.7181445543779494,
42
+ "learning_rate": 1.8348623853211011e-06,
43
+ "loss": 1.0199,
44
  "step": 20
45
  },
46
  {
47
  "epoch": 0.023073373327180433,
48
+ "grad_norm": 2.3056635728248183,
49
+ "learning_rate": 2.2935779816513764e-06,
50
+ "loss": 1.0075,
51
  "step": 25
52
  },
53
  {
54
  "epoch": 0.02768804799261652,
55
+ "grad_norm": 2.6315302800596365,
56
+ "learning_rate": 2.7522935779816517e-06,
57
+ "loss": 0.9732,
58
  "step": 30
59
  },
60
  {
61
  "epoch": 0.032302722658052604,
62
+ "grad_norm": 2.0678619803720886,
63
+ "learning_rate": 3.211009174311927e-06,
64
+ "loss": 0.9964,
65
  "step": 35
66
  },
67
  {
68
  "epoch": 0.03691739732348869,
69
+ "grad_norm": 3.005634145043816,
70
+ "learning_rate": 3.6697247706422022e-06,
71
+ "loss": 1.0015,
72
  "step": 40
73
  },
74
  {
75
  "epoch": 0.04153207198892478,
76
+ "grad_norm": 2.495857277480277,
77
+ "learning_rate": 4.128440366972478e-06,
78
+ "loss": 0.979,
79
  "step": 45
80
  },
81
  {
82
  "epoch": 0.046146746654360866,
83
+ "grad_norm": 2.442092473284365,
84
+ "learning_rate": 4.587155963302753e-06,
85
+ "loss": 0.9723,
86
  "step": 50
87
  },
88
  {
89
  "epoch": 0.050761421319796954,
90
+ "grad_norm": 2.1284759016567136,
91
+ "learning_rate": 5.045871559633028e-06,
92
+ "loss": 0.9722,
93
  "step": 55
94
  },
95
  {
96
  "epoch": 0.05537609598523304,
97
+ "grad_norm": 2.83564108049901,
98
+ "learning_rate": 5.504587155963303e-06,
99
+ "loss": 0.9852,
100
  "step": 60
101
  },
102
  {
103
  "epoch": 0.05999077065066913,
104
+ "grad_norm": 2.0748240090940975,
105
+ "learning_rate": 5.963302752293578e-06,
106
+ "loss": 0.9845,
107
  "step": 65
108
  },
109
  {
110
  "epoch": 0.06460544531610521,
111
+ "grad_norm": 2.9709334092183863,
112
+ "learning_rate": 6.422018348623854e-06,
113
+ "loss": 0.9615,
114
  "step": 70
115
  },
116
  {
117
  "epoch": 0.0692201199815413,
118
+ "grad_norm": 2.184920669369361,
119
+ "learning_rate": 6.880733944954129e-06,
120
+ "loss": 0.9675,
121
  "step": 75
122
  },
123
  {
124
  "epoch": 0.07383479464697738,
125
+ "grad_norm": 2.1738804546566817,
126
+ "learning_rate": 7.3394495412844045e-06,
127
+ "loss": 0.9589,
128
  "step": 80
129
  },
130
  {
131
  "epoch": 0.07844946931241348,
132
+ "grad_norm": 2.223214536084122,
133
+ "learning_rate": 7.79816513761468e-06,
134
+ "loss": 0.977,
135
  "step": 85
136
  },
137
  {
138
  "epoch": 0.08306414397784956,
139
+ "grad_norm": 1.990814551883062,
140
+ "learning_rate": 8.256880733944956e-06,
141
+ "loss": 0.9767,
142
  "step": 90
143
  },
144
  {
145
  "epoch": 0.08767881864328565,
146
+ "grad_norm": 2.0700255239398375,
147
+ "learning_rate": 8.71559633027523e-06,
148
+ "loss": 0.9828,
149
  "step": 95
150
  },
151
  {
152
  "epoch": 0.09229349330872173,
153
+ "grad_norm": 1.821385794436734,
154
+ "learning_rate": 9.174311926605506e-06,
155
+ "loss": 0.9896,
156
  "step": 100
157
  },
158
  {
159
  "epoch": 0.09229349330872173,
160
+ "eval_loss": 0.9892959594726562,
161
+ "eval_runtime": 901.1864,
162
+ "eval_samples_per_second": 17.033,
163
+ "eval_steps_per_second": 0.133,
164
  "step": 100
165
  },
166
  {
167
  "epoch": 0.09690816797415783,
168
+ "grad_norm": 2.1363964627484155,
169
+ "learning_rate": 9.633027522935781e-06,
170
+ "loss": 1.015,
171
  "step": 105
172
  },
173
  {
174
  "epoch": 0.10152284263959391,
175
+ "grad_norm": 1.9477999761197564,
176
+ "learning_rate": 9.999973991131207e-06,
177
+ "loss": 0.9983,
178
  "step": 110
179
  },
180
  {
181
  "epoch": 0.10613751730503,
182
+ "grad_norm": 2.0399110459564516,
183
+ "learning_rate": 9.999063709134502e-06,
184
+ "loss": 1.0062,
185
  "step": 115
186
  },
187
  {
188
  "epoch": 0.11075219197046608,
189
+ "grad_norm": 1.9098211009145907,
190
+ "learning_rate": 9.996853254269984e-06,
191
+ "loss": 0.9855,
192
  "step": 120
193
  },
194
  {
195
  "epoch": 0.11536686663590216,
196
+ "grad_norm": 1.7833107957442789,
197
+ "learning_rate": 9.99334320144e-06,
198
+ "loss": 1.0093,
199
  "step": 125
200
  },
201
  {
202
  "epoch": 0.11998154130133826,
203
+ "grad_norm": 1.7815777364460414,
204
+ "learning_rate": 9.988534463550585e-06,
205
+ "loss": 0.981,
206
  "step": 130
207
  },
208
  {
209
  "epoch": 0.12459621596677434,
210
+ "grad_norm": 1.8694974323828528,
211
+ "learning_rate": 9.982428291274047e-06,
212
+ "loss": 1.0102,
213
  "step": 135
214
  },
215
  {
216
  "epoch": 0.12921089063221042,
217
+ "grad_norm": 1.8237445983099982,
218
+ "learning_rate": 9.975026272723677e-06,
219
+ "loss": 1.006,
220
  "step": 140
221
  },
222
  {
223
  "epoch": 0.13382556529764653,
224
+ "grad_norm": 1.8625032130850405,
225
+ "learning_rate": 9.96633033304071e-06,
226
+ "loss": 0.984,
227
  "step": 145
228
  },
229
  {
230
  "epoch": 0.1384402399630826,
231
+ "grad_norm": 2.072572452539365,
232
+ "learning_rate": 9.95634273389363e-06,
233
+ "loss": 0.9712,
234
  "step": 150
235
  },
236
  {
237
  "epoch": 0.1430549146285187,
238
+ "grad_norm": 2.0593175620343547,
239
+ "learning_rate": 9.945066072889942e-06,
240
+ "loss": 1.0211,
241
  "step": 155
242
  },
243
  {
244
  "epoch": 0.14766958929395477,
245
+ "grad_norm": 2.0200152592170943,
246
+ "learning_rate": 9.93250328290059e-06,
247
+ "loss": 0.9775,
248
  "step": 160
249
  },
250
  {
251
  "epoch": 0.15228426395939088,
252
+ "grad_norm": 1.875921122063454,
253
+ "learning_rate": 9.918657631297153e-06,
254
+ "loss": 0.9847,
255
  "step": 165
256
  },
257
  {
258
  "epoch": 0.15689893862482696,
259
+ "grad_norm": 1.8527340279543534,
260
+ "learning_rate": 9.90353271910206e-06,
261
+ "loss": 0.9872,
262
  "step": 170
263
  },
264
  {
265
  "epoch": 0.16151361329026304,
266
+ "grad_norm": 1.8908823561795056,
267
+ "learning_rate": 9.887132480052028e-06,
268
+ "loss": 0.9712,
269
  "step": 175
270
  },
271
  {
272
  "epoch": 0.16612828795569912,
273
+ "grad_norm": 1.8296519425504616,
274
+ "learning_rate": 9.869461179574963e-06,
275
+ "loss": 0.9859,
276
  "step": 180
277
  },
278
  {
279
  "epoch": 0.1707429626211352,
280
+ "grad_norm": 1.7574451917239733,
281
+ "learning_rate": 9.850523413680588e-06,
282
+ "loss": 0.9911,
283
  "step": 185
284
  },
285
  {
286
  "epoch": 0.1753576372865713,
287
+ "grad_norm": 1.7389467033514425,
288
+ "learning_rate": 9.830324107765104e-06,
289
+ "loss": 0.9739,
290
  "step": 190
291
  },
292
  {
293
  "epoch": 0.17997231195200739,
294
+ "grad_norm": 1.7050530099463075,
295
+ "learning_rate": 9.808868515330169e-06,
296
+ "loss": 0.9803,
297
  "step": 195
298
  },
299
  {
300
  "epoch": 0.18458698661744347,
301
+ "grad_norm": 1.756999826792102,
302
+ "learning_rate": 9.786162216616561e-06,
303
+ "loss": 0.9838,
304
  "step": 200
305
  },
306
  {
307
  "epoch": 0.18458698661744347,
308
+ "eval_loss": 0.9935115575790405,
309
+ "eval_runtime": 751.2551,
310
+ "eval_samples_per_second": 20.432,
311
+ "eval_steps_per_second": 0.16,
312
  "step": 200
313
  },
314
  {
315
  "epoch": 0.18920166128287955,
316
+ "grad_norm": 1.778225787399477,
317
+ "learning_rate": 9.762211117152839e-06,
318
+ "loss": 0.9754,
319
  "step": 205
320
  },
321
  {
322
  "epoch": 0.19381633594831565,
323
+ "grad_norm": 1.6039072749046104,
324
+ "learning_rate": 9.737021446219424e-06,
325
+ "loss": 0.9577,
326
  "step": 210
327
  },
328
  {
329
  "epoch": 0.19843101061375173,
330
+ "grad_norm": 1.8562210127250676,
331
+ "learning_rate": 9.71059975522846e-06,
332
+ "loss": 0.9708,
333
  "step": 215
334
  },
335
  {
336
  "epoch": 0.20304568527918782,
337
+ "grad_norm": 1.602175292503488,
338
+ "learning_rate": 9.682952916019907e-06,
339
+ "loss": 0.9922,
340
  "step": 220
341
  },
342
  {
343
  "epoch": 0.2076603599446239,
344
+ "grad_norm": 1.5958137821643004,
345
+ "learning_rate": 9.654088119074282e-06,
346
+ "loss": 0.9965,
347
  "step": 225
348
  },
349
  {
350
  "epoch": 0.21227503461006,
351
+ "grad_norm": 1.8573846533843932,
352
+ "learning_rate": 9.624012871642545e-06,
353
+ "loss": 0.9753,
354
  "step": 230
355
  },
356
  {
357
  "epoch": 0.21688970927549608,
358
+ "grad_norm": 1.640370350766078,
359
+ "learning_rate": 9.592734995793583e-06,
360
+ "loss": 0.9925,
361
  "step": 235
362
  },
363
  {
364
  "epoch": 0.22150438394093216,
365
+ "grad_norm": 1.6631975332565576,
366
+ "learning_rate": 9.560262626379824e-06,
367
+ "loss": 0.9829,
368
  "step": 240
369
  },
370
  {
371
  "epoch": 0.22611905860636825,
372
+ "grad_norm": 1.6393108832006937,
373
+ "learning_rate": 9.52660420892149e-06,
374
+ "loss": 0.9837,
375
  "step": 245
376
  },
377
  {
378
  "epoch": 0.23073373327180433,
379
+ "grad_norm": 1.5999393221352023,
380
+ "learning_rate": 9.49176849741007e-06,
381
+ "loss": 0.9822,
382
  "step": 250
383
  },
384
  {
385
  "epoch": 0.23534840793724043,
386
+ "grad_norm": 1.6562799165061401,
387
+ "learning_rate": 9.455764552031546e-06,
388
+ "loss": 0.9876,
389
  "step": 255
390
  },
391
  {
392
  "epoch": 0.23996308260267651,
393
+ "grad_norm": 1.597298992345653,
394
+ "learning_rate": 9.418601736809989e-06,
395
+ "loss": 0.99,
396
  "step": 260
397
  },
398
  {
399
  "epoch": 0.2445777572681126,
400
+ "grad_norm": 1.615615429575249,
401
+ "learning_rate": 9.380289717172141e-06,
402
+ "loss": 0.9944,
403
  "step": 265
404
  },
405
  {
406
  "epoch": 0.24919243193354867,
407
+ "grad_norm": 1.6391764879253228,
408
+ "learning_rate": 9.340838457433588e-06,
409
+ "loss": 0.9941,
410
  "step": 270
411
  },
412
  {
413
  "epoch": 0.25380710659898476,
414
+ "grad_norm": 1.7100035873468014,
415
+ "learning_rate": 9.300258218207215e-06,
416
+ "loss": 0.9785,
417
  "step": 275
418
  },
419
  {
420
  "epoch": 0.25842178126442084,
421
+ "grad_norm": 2.095593373075969,
422
+ "learning_rate": 9.258559553734597e-06,
423
+ "loss": 0.9883,
424
  "step": 280
425
  },
426
  {
427
  "epoch": 0.26303645592985697,
428
+ "grad_norm": 1.6273765059661172,
429
+ "learning_rate": 9.215753309141e-06,
430
+ "loss": 0.9772,
431
  "step": 285
432
  },
433
  {
434
  "epoch": 0.26765113059529305,
435
+ "grad_norm": 1.6695304390680636,
436
+ "learning_rate": 9.17185061761477e-06,
437
+ "loss": 0.9881,
438
  "step": 290
439
  },
440
  {
441
  "epoch": 0.27226580526072913,
442
+ "grad_norm": 1.6630409654600378,
443
+ "learning_rate": 9.126862897511752e-06,
444
+ "loss": 0.9785,
445
  "step": 295
446
  },
447
  {
448
  "epoch": 0.2768804799261652,
449
+ "grad_norm": 1.6120537857852493,
450
+ "learning_rate": 9.080801849385585e-06,
451
+ "loss": 0.9853,
452
  "step": 300
453
  },
454
  {
455
  "epoch": 0.2768804799261652,
456
+ "eval_loss": 0.9881044626235962,
457
+ "eval_runtime": 648.9048,
458
+ "eval_samples_per_second": 23.655,
459
+ "eval_steps_per_second": 0.185,
460
  "step": 300
461
  },
462
  {
463
  "epoch": 0.2814951545916013,
464
+ "grad_norm": 1.5682941966581576,
465
+ "learning_rate": 9.033679452944574e-06,
466
+ "loss": 0.9593,
467
  "step": 305
468
  },
469
  {
470
  "epoch": 0.2861098292570374,
471
+ "grad_norm": 1.7509871813668145,
472
+ "learning_rate": 8.98550796393597e-06,
473
+ "loss": 0.9833,
474
  "step": 310
475
  },
476
  {
477
  "epoch": 0.29072450392247345,
478
+ "grad_norm": 1.5617036787004468,
479
+ "learning_rate": 8.936299910958461e-06,
480
+ "loss": 0.9732,
481
  "step": 315
482
  },
483
  {
484
  "epoch": 0.29533917858790953,
485
+ "grad_norm": 1.6332967875738513,
486
+ "learning_rate": 8.886068092203684e-06,
487
+ "loss": 0.9744,
488
  "step": 320
489
  },
490
  {
491
  "epoch": 0.2999538532533456,
492
+ "grad_norm": 1.7354150495189573,
493
+ "learning_rate": 8.834825572127632e-06,
494
+ "loss": 0.9827,
495
  "step": 325
496
  },
497
  {
498
  "epoch": 0.30456852791878175,
499
+ "grad_norm": 1.588233836037739,
500
+ "learning_rate": 8.782585678052814e-06,
501
+ "loss": 0.9962,
502
  "step": 330
503
  },
504
  {
505
  "epoch": 0.30918320258421783,
506
+ "grad_norm": 1.5809425639553816,
507
+ "learning_rate": 8.729361996702032e-06,
508
+ "loss": 0.9903,
509
  "step": 335
510
  },
511
  {
512
  "epoch": 0.3137978772496539,
513
+ "grad_norm": 1.603602687498353,
514
+ "learning_rate": 8.675168370664706e-06,
515
+ "loss": 0.981,
516
  "step": 340
517
  },
518
  {
519
  "epoch": 0.31841255191509,
520
+ "grad_norm": 1.5782147372024207,
521
+ "learning_rate": 8.620018894796654e-06,
522
+ "loss": 0.9888,
523
  "step": 345
524
  },
525
  {
526
  "epoch": 0.3230272265805261,
527
+ "grad_norm": 1.5566880880848435,
528
+ "learning_rate": 8.56392791255424e-06,
529
+ "loss": 0.9648,
530
  "step": 350
531
  },
532
  {
533
  "epoch": 0.32764190124596215,
534
+ "grad_norm": 1.655119543708647,
535
+ "learning_rate": 8.5069100122639e-06,
536
+ "loss": 0.9761,
537
  "step": 355
538
  },
539
  {
540
  "epoch": 0.33225657591139823,
541
+ "grad_norm": 1.9303593174940483,
542
+ "learning_rate": 8.448980023327943e-06,
543
+ "loss": 0.978,
544
  "step": 360
545
  },
546
  {
547
  "epoch": 0.3368712505768343,
548
+ "grad_norm": 1.6875645085180389,
549
+ "learning_rate": 8.390153012367692e-06,
550
+ "loss": 0.9704,
551
  "step": 365
552
  },
553
  {
554
  "epoch": 0.3414859252422704,
555
+ "grad_norm": 1.625080279365752,
556
+ "learning_rate": 8.330444279304887e-06,
557
+ "loss": 0.9933,
558
  "step": 370
559
  },
560
  {
561
  "epoch": 0.34610059990770653,
562
+ "grad_norm": 1.5399521803126077,
563
+ "learning_rate": 8.269869353382448e-06,
564
+ "loss": 0.9768,
565
  "step": 375
566
  },
567
  {
568
  "epoch": 0.3507152745731426,
569
+ "grad_norm": 1.5199082736016265,
570
+ "learning_rate": 8.208443989125567e-06,
571
+ "loss": 0.9702,
572
  "step": 380
573
  },
574
  {
575
  "epoch": 0.3553299492385787,
576
+ "grad_norm": 1.6898798315423416,
577
+ "learning_rate": 8.146184162244231e-06,
578
+ "loss": 0.9508,
579
  "step": 385
580
  },
581
  {
582
  "epoch": 0.35994462390401477,
583
+ "grad_norm": 1.5767639905295907,
584
+ "learning_rate": 8.083106065478192e-06,
585
+ "loss": 0.9617,
586
  "step": 390
587
  },
588
  {
589
  "epoch": 0.36455929856945085,
590
+ "grad_norm": 1.6940722444661227,
591
+ "learning_rate": 8.019226104385519e-06,
592
+ "loss": 0.9827,
593
  "step": 395
594
  },
595
  {
596
  "epoch": 0.36917397323488693,
597
+ "grad_norm": 1.542549043886959,
598
+ "learning_rate": 7.954560893075785e-06,
599
+ "loss": 0.9638,
600
  "step": 400
601
  },
602
  {
603
  "epoch": 0.36917397323488693,
604
+ "eval_loss": 0.978050708770752,
605
+ "eval_runtime": 631.4932,
606
+ "eval_samples_per_second": 24.307,
607
+ "eval_steps_per_second": 0.19,
608
  "step": 400
609
  },
610
  {
611
  "epoch": 0.373788647900323,
612
+ "grad_norm": 1.5935524851120768,
613
+ "learning_rate": 7.889127249889003e-06,
614
+ "loss": 0.9673,
615
  "step": 405
616
  },
617
  {
618
  "epoch": 0.3784033225657591,
619
+ "grad_norm": 1.497119604051003,
620
+ "learning_rate": 7.82294219302148e-06,
621
+ "loss": 0.9574,
622
  "step": 410
623
  },
624
  {
625
  "epoch": 0.3830179972311952,
626
+ "grad_norm": 1.709877148318467,
627
+ "learning_rate": 7.75602293609964e-06,
628
+ "loss": 0.9483,
629
  "step": 415
630
  },
631
  {
632
  "epoch": 0.3876326718966313,
633
+ "grad_norm": 1.5527701210281757,
634
+ "learning_rate": 7.688386883703071e-06,
635
+ "loss": 0.9904,
636
  "step": 420
637
  },
638
  {
639
  "epoch": 0.3922473465620674,
640
+ "grad_norm": 1.651581387061671,
641
+ "learning_rate": 7.620051626837878e-06,
642
+ "loss": 0.9619,
643
  "step": 425
644
  },
645
  {
646
  "epoch": 0.39686202122750347,
647
+ "grad_norm": 1.5992647254822225,
648
+ "learning_rate": 7.55103493836155e-06,
649
+ "loss": 0.9688,
650
  "step": 430
651
  },
652
  {
653
  "epoch": 0.40147669589293955,
654
+ "grad_norm": 1.4883188672166754,
655
+ "learning_rate": 7.481354768360543e-06,
656
+ "loss": 0.9493,
657
  "step": 435
658
  },
659
  {
660
  "epoch": 0.40609137055837563,
661
+ "grad_norm": 1.540282089268457,
662
+ "learning_rate": 7.411029239481766e-06,
663
+ "loss": 0.9656,
664
  "step": 440
665
  },
666
  {
667
  "epoch": 0.4107060452238117,
668
+ "grad_norm": 1.57006447700181,
669
+ "learning_rate": 7.340076642219172e-06,
670
+ "loss": 0.963,
671
  "step": 445
672
  },
673
  {
674
  "epoch": 0.4153207198892478,
675
+ "grad_norm": 1.6069506996123204,
676
+ "learning_rate": 7.268515430156722e-06,
677
+ "loss": 0.9729,
678
  "step": 450
679
  },
680
  {
681
  "epoch": 0.41993539455468387,
682
+ "grad_norm": 1.5002386553771234,
683
+ "learning_rate": 7.196364215168901e-06,
684
+ "loss": 0.9521,
685
  "step": 455
686
  },
687
  {
688
  "epoch": 0.42455006922012,
689
+ "grad_norm": 1.4646720705420488,
690
+ "learning_rate": 7.123641762580089e-06,
691
+ "loss": 0.9507,
692
  "step": 460
693
  },
694
  {
695
  "epoch": 0.4291647438855561,
696
+ "grad_norm": 1.6149847953084777,
697
+ "learning_rate": 7.050366986284019e-06,
698
+ "loss": 0.9635,
699
  "step": 465
700
  },
701
  {
702
  "epoch": 0.43377941855099217,
703
+ "grad_norm": 1.554087118833947,
704
+ "learning_rate": 6.9765589438245765e-06,
705
+ "loss": 0.9744,
706
  "step": 470
707
  },
708
  {
709
  "epoch": 0.43839409321642825,
710
+ "grad_norm": 1.4890425150210163,
711
+ "learning_rate": 6.9022368314392595e-06,
712
+ "loss": 0.9694,
713
  "step": 475
714
  },
715
  {
716
  "epoch": 0.44300876788186433,
717
+ "grad_norm": 1.5064709526763582,
718
+ "learning_rate": 6.827419979066559e-06,
719
+ "loss": 0.9577,
720
  "step": 480
721
  },
722
  {
723
  "epoch": 0.4476234425473004,
724
+ "grad_norm": 1.5765372233113244,
725
+ "learning_rate": 6.752127845318561e-06,
726
+ "loss": 0.9777,
727
  "step": 485
728
  },
729
  {
730
  "epoch": 0.4522381172127365,
731
+ "grad_norm": 1.502803008633363,
732
+ "learning_rate": 6.676380012420087e-06,
733
+ "loss": 0.9543,
734
  "step": 490
735
  },
736
  {
737
  "epoch": 0.45685279187817257,
738
+ "grad_norm": 1.480113884170601,
739
+ "learning_rate": 6.600196181115692e-06,
740
+ "loss": 0.9413,
741
  "step": 495
742
  },
743
  {
744
  "epoch": 0.46146746654360865,
745
+ "grad_norm": 1.518180225480369,
746
+ "learning_rate": 6.523596165545818e-06,
747
+ "loss": 0.9745,
748
  "step": 500
749
  },
750
  {
751
  "epoch": 0.46146746654360865,
752
+ "eval_loss": 0.9680244326591492,
753
+ "eval_runtime": 631.2665,
754
+ "eval_samples_per_second": 24.316,
755
+ "eval_steps_per_second": 0.19,
756
  "step": 500
757
  },
758
  {
759
  "epoch": 0.4660821412090448,
760
+ "grad_norm": 1.5182752103495631,
761
+ "learning_rate": 6.446599888093478e-06,
762
+ "loss": 0.9493,
763
  "step": 505
764
  },
765
  {
766
  "epoch": 0.47069681587448087,
767
+ "grad_norm": 1.6381019403661983,
768
+ "learning_rate": 6.369227374202776e-06,
769
+ "loss": 0.9655,
770
  "step": 510
771
  },
772
  {
773
  "epoch": 0.47531149053991695,
774
+ "grad_norm": 1.480944986996014,
775
+ "learning_rate": 6.291498747170615e-06,
776
+ "loss": 0.973,
777
  "step": 515
778
  },
779
  {
780
  "epoch": 0.47992616520535303,
781
+ "grad_norm": 1.515420838901527,
782
+ "learning_rate": 6.213434222912977e-06,
783
+ "loss": 0.9618,
784
  "step": 520
785
  },
786
  {
787
  "epoch": 0.4845408398707891,
788
+ "grad_norm": 1.5303480066641486,
789
+ "learning_rate": 6.135054104707093e-06,
790
+ "loss": 0.9439,
791
  "step": 525
792
  },
793
  {
794
  "epoch": 0.4891555145362252,
795
+ "grad_norm": 1.487348077399422,
796
+ "learning_rate": 6.056378777910898e-06,
797
+ "loss": 0.9565,
798
  "step": 530
799
  },
800
  {
801
  "epoch": 0.49377018920166127,
802
+ "grad_norm": 1.5775060237163985,
803
+ "learning_rate": 5.977428704661151e-06,
804
+ "loss": 0.9407,
805
  "step": 535
806
  },
807
  {
808
  "epoch": 0.49838486386709735,
809
+ "grad_norm": 1.6339716299433642,
810
+ "learning_rate": 5.898224418551565e-06,
811
+ "loss": 0.9532,
812
  "step": 540
813
  },
814
  {
815
  "epoch": 0.5029995385325334,
816
+ "grad_norm": 1.5688231463762927,
817
+ "learning_rate": 5.8187865192923644e-06,
818
+ "loss": 0.9433,
819
  "step": 545
820
  },
821
  {
822
  "epoch": 0.5076142131979695,
823
+ "grad_norm": 1.465375400099311,
824
+ "learning_rate": 5.739135667352651e-06,
825
+ "loss": 0.9494,
826
  "step": 550
827
  },
828
  {
829
  "epoch": 0.5122288878634056,
830
+ "grad_norm": 1.590916658194805,
831
+ "learning_rate": 5.659292578586957e-06,
832
+ "loss": 0.9574,
833
  "step": 555
834
  },
835
  {
836
  "epoch": 0.5168435625288417,
837
+ "grad_norm": 1.4545280778553549,
838
+ "learning_rate": 5.579278018847395e-06,
839
+ "loss": 0.9471,
840
  "step": 560
841
  },
842
  {
843
  "epoch": 0.5214582371942778,
844
+ "grad_norm": 1.4258346370497963,
845
+ "learning_rate": 5.499112798582814e-06,
846
+ "loss": 0.9456,
847
  "step": 565
848
  },
849
  {
850
  "epoch": 0.5260729118597139,
851
+ "grad_norm": 1.4412831745954977,
852
+ "learning_rate": 5.418817767426343e-06,
853
+ "loss": 0.9419,
854
  "step": 570
855
  },
856
  {
857
  "epoch": 0.53068758652515,
858
+ "grad_norm": 1.5161519877588197,
859
+ "learning_rate": 5.3384138087727555e-06,
860
+ "loss": 0.9429,
861
  "step": 575
862
  },
863
  {
864
  "epoch": 0.5353022611905861,
865
+ "grad_norm": 1.5365045658327852,
866
+ "learning_rate": 5.257921834347043e-06,
867
+ "loss": 0.9421,
868
  "step": 580
869
  },
870
  {
871
  "epoch": 0.5399169358560222,
872
+ "grad_norm": 1.5096905798025197,
873
+ "learning_rate": 5.177362778765629e-06,
874
+ "loss": 0.9418,
875
  "step": 585
876
  },
877
  {
878
  "epoch": 0.5445316105214583,
879
+ "grad_norm": 1.4931388458814197,
880
+ "learning_rate": 5.096757594091623e-06,
881
+ "loss": 0.9336,
882
  "step": 590
883
  },
884
  {
885
  "epoch": 0.5491462851868943,
886
+ "grad_norm": 1.54787574418171,
887
+ "learning_rate": 5.01612724438554e-06,
888
+ "loss": 0.9594,
889
  "step": 595
890
  },
891
  {
892
  "epoch": 0.5537609598523304,
893
+ "grad_norm": 1.4047855369892224,
894
+ "learning_rate": 4.935492700252903e-06,
895
+ "loss": 0.9396,
896
  "step": 600
897
  },
898
  {
899
  "epoch": 0.5537609598523304,
900
+ "eval_loss": 0.9567832350730896,
901
+ "eval_runtime": 1029.8073,
902
+ "eval_samples_per_second": 14.906,
903
+ "eval_steps_per_second": 0.117,
904
  "step": 600
905
  },
906
  {
907
  "epoch": 0.5583756345177665,
908
+ "grad_norm": 1.532026589616403,
909
+ "learning_rate": 4.854874933390124e-06,
910
+ "loss": 0.9464,
911
  "step": 605
912
  },
913
  {
914
  "epoch": 0.5629903091832026,
915
+ "grad_norm": 1.4445505648251018,
916
+ "learning_rate": 4.774294911130141e-06,
917
+ "loss": 0.9564,
918
  "step": 610
919
  },
920
  {
921
  "epoch": 0.5676049838486387,
922
+ "grad_norm": 1.3806609000939527,
923
+ "learning_rate": 4.6937735909891456e-06,
924
+ "loss": 0.9401,
925
  "step": 615
926
  },
927
  {
928
  "epoch": 0.5722196585140747,
929
+ "grad_norm": 1.4758989461572256,
930
+ "learning_rate": 4.6133319152158886e-06,
931
+ "loss": 0.9504,
932
  "step": 620
933
  },
934
  {
935
  "epoch": 0.5768343331795108,
936
+ "grad_norm": 1.411552408489387,
937
+ "learning_rate": 4.532990805344958e-06,
938
+ "loss": 0.9382,
939
  "step": 625
940
  },
941
  {
942
  "epoch": 0.5814490078449469,
943
+ "grad_norm": 1.4215358213534153,
944
+ "learning_rate": 4.452771156755423e-06,
945
+ "loss": 0.9457,
946
  "step": 630
947
  },
948
  {
949
  "epoch": 0.586063682510383,
950
+ "grad_norm": 1.5194235066664028,
951
+ "learning_rate": 4.372693833236319e-06,
952
+ "loss": 0.9538,
953
  "step": 635
954
  },
955
  {
956
  "epoch": 0.5906783571758191,
957
+ "grad_norm": 1.504251672226047,
958
+ "learning_rate": 4.292779661560295e-06,
959
+ "loss": 0.9541,
960
  "step": 640
961
  },
962
  {
963
  "epoch": 0.5952930318412551,
964
+ "grad_norm": 1.477570181176633,
965
+ "learning_rate": 4.213049426066946e-06,
966
+ "loss": 0.932,
967
  "step": 645
968
  },
969
  {
970
  "epoch": 0.5999077065066912,
971
+ "grad_norm": 1.4830574491662214,
972
+ "learning_rate": 4.133523863257139e-06,
973
+ "loss": 0.9499,
974
  "step": 650
975
  },
976
  {
977
  "epoch": 0.6045223811721273,
978
+ "grad_norm": 1.4506011398131606,
979
+ "learning_rate": 4.054223656399794e-06,
980
+ "loss": 0.9432,
981
  "step": 655
982
  },
983
  {
984
  "epoch": 0.6091370558375635,
985
+ "grad_norm": 1.5259712782334296,
986
+ "learning_rate": 3.975169430152524e-06,
987
+ "loss": 0.9336,
988
  "step": 660
989
  },
990
  {
991
  "epoch": 0.6137517305029996,
992
+ "grad_norm": 1.4462086241646492,
993
+ "learning_rate": 3.8963817451974915e-06,
994
+ "loss": 0.9434,
995
  "step": 665
996
  },
997
  {
998
  "epoch": 0.6183664051684357,
999
+ "grad_norm": 1.4279004522752485,
1000
+ "learning_rate": 3.817881092893934e-06,
1001
+ "loss": 0.9468,
1002
  "step": 670
1003
  },
1004
  {
1005
  "epoch": 0.6229810798338717,
1006
+ "grad_norm": 1.4169615015471333,
1007
+ "learning_rate": 3.7396878899486896e-06,
1008
+ "loss": 0.9416,
1009
  "step": 675
1010
  },
1011
  {
1012
  "epoch": 0.6275957544993078,
1013
+ "grad_norm": 1.4479507419493,
1014
+ "learning_rate": 3.6618224731061658e-06,
1015
+ "loss": 0.9388,
1016
  "step": 680
1017
  },
1018
  {
1019
  "epoch": 0.6322104291647439,
1020
+ "grad_norm": 1.4589224612269058,
1021
+ "learning_rate": 3.584305093859082e-06,
1022
+ "loss": 0.9384,
1023
  "step": 685
1024
  },
1025
  {
1026
  "epoch": 0.63682510383018,
1027
+ "grad_norm": 1.4183964058904668,
1028
+ "learning_rate": 3.507155913181402e-06,
1029
+ "loss": 0.9347,
1030
  "step": 690
1031
  },
1032
  {
1033
  "epoch": 0.6414397784956161,
1034
+ "grad_norm": 1.4542672740750342,
1035
+ "learning_rate": 3.4303949962848003e-06,
1036
+ "loss": 0.9494,
1037
  "step": 695
1038
  },
1039
  {
1040
  "epoch": 0.6460544531610521,
1041
+ "grad_norm": 1.466448569041173,
1042
+ "learning_rate": 3.3540423074000323e-06,
1043
+ "loss": 0.9176,
1044
  "step": 700
1045
  },
1046
  {
1047
  "epoch": 0.6460544531610521,
1048
+ "eval_loss": 0.9464961290359497,
1049
+ "eval_runtime": 630.6464,
1050
+ "eval_samples_per_second": 24.34,
1051
+ "eval_steps_per_second": 0.19,
1052
  "step": 700
1053
  },
1054
  {
1055
  "epoch": 0.6506691278264882,
1056
+ "grad_norm": 1.4112284829468522,
1057
+ "learning_rate": 3.278117704584577e-06,
1058
+ "loss": 0.9164,
1059
  "step": 705
1060
  },
1061
  {
1062
  "epoch": 0.6552838024919243,
1063
+ "grad_norm": 1.4108457010710305,
1064
+ "learning_rate": 3.202640934557884e-06,
1065
+ "loss": 0.9213,
1066
  "step": 710
1067
  },
1068
  {
1069
  "epoch": 0.6598984771573604,
1070
+ "grad_norm": 1.4189577161809213,
1071
+ "learning_rate": 3.127631627565586e-06,
1072
+ "loss": 0.9368,
1073
  "step": 715
1074
  },
1075
  {
1076
  "epoch": 0.6645131518227965,
1077
+ "grad_norm": 1.452002361321377,
1078
+ "learning_rate": 3.053109292273996e-06,
1079
+ "loss": 0.9372,
1080
  "step": 720
1081
  },
1082
  {
1083
  "epoch": 0.6691278264882325,
1084
+ "grad_norm": 1.4024056929659159,
1085
+ "learning_rate": 2.9790933106962328e-06,
1086
+ "loss": 0.925,
1087
  "step": 725
1088
  },
1089
  {
1090
  "epoch": 0.6737425011536686,
1091
+ "grad_norm": 1.4306957271780452,
1092
+ "learning_rate": 2.9056029331512853e-06,
1093
+ "loss": 0.9259,
1094
  "step": 730
1095
  },
1096
  {
1097
  "epoch": 0.6783571758191047,
1098
+ "grad_norm": 1.3638433368150233,
1099
+ "learning_rate": 2.8326572732573167e-06,
1100
+ "loss": 0.9298,
1101
  "step": 735
1102
  },
1103
  {
1104
  "epoch": 0.6829718504845408,
1105
+ "grad_norm": 1.438840741138399,
1106
+ "learning_rate": 2.7602753029605456e-06,
1107
+ "loss": 0.9312,
1108
  "step": 740
1109
  },
1110
  {
1111
  "epoch": 0.687586525149977,
1112
+ "grad_norm": 1.3538477302959735,
1113
+ "learning_rate": 2.688475847600947e-06,
1114
+ "loss": 0.9328,
1115
  "step": 745
1116
  },
1117
  {
1118
  "epoch": 0.6922011998154131,
1119
+ "grad_norm": 1.473692645733402,
1120
+ "learning_rate": 2.6172775810161104e-06,
1121
+ "loss": 0.9239,
1122
  "step": 750
1123
  },
1124
  {
1125
  "epoch": 0.6968158744808491,
1126
+ "grad_norm": 1.5623662932746458,
1127
+ "learning_rate": 2.546699020684471e-06,
1128
+ "loss": 0.9371,
1129
  "step": 755
1130
  },
1131
  {
1132
  "epoch": 0.7014305491462852,
1133
+ "grad_norm": 1.4016556607303827,
1134
+ "learning_rate": 2.4767585229092368e-06,
1135
+ "loss": 0.9308,
1136
  "step": 760
1137
  },
1138
  {
1139
  "epoch": 0.7060452238117213,
1140
+ "grad_norm": 1.3899751453383695,
1141
+ "learning_rate": 2.407474278044215e-06,
1142
+ "loss": 0.9332,
1143
  "step": 765
1144
  },
1145
  {
1146
  "epoch": 0.7106598984771574,
1147
+ "grad_norm": 1.4345839815007075,
1148
+ "learning_rate": 2.3388643057628025e-06,
1149
+ "loss": 0.9283,
1150
  "step": 770
1151
  },
1152
  {
1153
  "epoch": 0.7152745731425935,
1154
+ "grad_norm": 1.4014510611067819,
1155
+ "learning_rate": 2.2709464503713785e-06,
1156
+ "loss": 0.9196,
1157
  "step": 775
1158
  },
1159
  {
1160
  "epoch": 0.7198892478080295,
1161
+ "grad_norm": 1.4343098995975527,
1162
+ "learning_rate": 2.2037383761682877e-06,
1163
+ "loss": 0.9211,
1164
  "step": 780
1165
  },
1166
  {
1167
  "epoch": 0.7245039224734656,
1168
+ "grad_norm": 1.342137285764838,
1169
+ "learning_rate": 2.1372575628496662e-06,
1170
+ "loss": 0.9206,
1171
  "step": 785
1172
  },
1173
  {
1174
  "epoch": 0.7291185971389017,
1175
+ "grad_norm": 1.396553897850799,
1176
+ "learning_rate": 2.071521300963246e-06,
1177
+ "loss": 0.9324,
1178
  "step": 790
1179
  },
1180
  {
1181
  "epoch": 0.7337332718043378,
1182
+ "grad_norm": 1.4382708247406493,
1183
+ "learning_rate": 2.0065466874113944e-06,
1184
+ "loss": 0.9159,
1185
  "step": 795
1186
  },
1187
  {
1188
  "epoch": 0.7383479464697739,
1189
+ "grad_norm": 1.3548453547021793,
1190
+ "learning_rate": 1.9423506210044746e-06,
1191
+ "loss": 0.9067,
1192
  "step": 800
1193
  },
1194
  {
1195
  "epoch": 0.7383479464697739,
1196
+ "eval_loss": 0.9378637671470642,
1197
+ "eval_runtime": 663.1431,
1198
+ "eval_samples_per_second": 23.147,
1199
+ "eval_steps_per_second": 0.181,
1200
  "step": 800
1201
  },
1202
  {
1203
  "epoch": 0.7429626211352099,
1204
+ "grad_norm": 1.3924317571532645,
1205
+ "learning_rate": 1.8789497980657644e-06,
1206
+ "loss": 0.9387,
1207
  "step": 805
1208
  },
1209
  {
1210
  "epoch": 0.747577295800646,
1211
+ "grad_norm": 1.4165077817002125,
1212
+ "learning_rate": 1.8163607080890143e-06,
1213
+ "loss": 0.9593,
1214
  "step": 810
1215
  },
1216
  {
1217
  "epoch": 0.7521919704660821,
1218
+ "grad_norm": 1.4141204258596356,
1219
+ "learning_rate": 1.7545996294498013e-06,
1220
+ "loss": 0.9374,
1221
  "step": 815
1222
  },
1223
  {
1224
  "epoch": 0.7568066451315182,
1225
+ "grad_norm": 1.3739608338173115,
1226
+ "learning_rate": 1.6936826251718075e-06,
1227
+ "loss": 0.9345,
1228
  "step": 820
1229
  },
1230
  {
1231
  "epoch": 0.7614213197969543,
1232
+ "grad_norm": 1.4298339467420962,
1233
+ "learning_rate": 1.6336255387490846e-06,
1234
+ "loss": 0.9185,
1235
  "step": 825
1236
  },
1237
  {
1238
  "epoch": 0.7660359944623903,
1239
+ "grad_norm": 1.4280106425956387,
1240
+ "learning_rate": 1.574443990025436e-06,
1241
+ "loss": 0.9251,
1242
  "step": 830
1243
  },
1244
  {
1245
  "epoch": 0.7706506691278265,
1246
+ "grad_norm": 1.4182309514153764,
1247
+ "learning_rate": 1.5161533711319454e-06,
1248
+ "loss": 0.9337,
1249
  "step": 835
1250
  },
1251
  {
1252
  "epoch": 0.7752653437932626,
1253
+ "grad_norm": 1.3937433613798544,
1254
+ "learning_rate": 1.4587688424837538e-06,
1255
+ "loss": 0.9448,
1256
  "step": 840
1257
  },
1258
  {
1259
  "epoch": 0.7798800184586987,
1260
+ "grad_norm": 1.456151982504558,
1261
+ "learning_rate": 1.4023053288370803e-06,
1262
+ "loss": 0.9129,
1263
  "step": 845
1264
  },
1265
  {
1266
  "epoch": 0.7844946931241348,
1267
+ "grad_norm": 1.3853091041139631,
1268
+ "learning_rate": 1.3467775154075425e-06,
1269
+ "loss": 0.9213,
1270
  "step": 850
1271
  },
1272
  {
1273
  "epoch": 0.7891093677895709,
1274
+ "grad_norm": 1.4547278352297517,
1275
+ "learning_rate": 1.2921998440507838e-06,
1276
+ "loss": 0.9211,
1277
  "step": 855
1278
  },
1279
  {
1280
  "epoch": 0.7937240424550069,
1281
+ "grad_norm": 1.4019092163127755,
1282
+ "learning_rate": 1.2385865095063808e-06,
1283
+ "loss": 0.9189,
1284
  "step": 860
1285
  },
1286
  {
1287
  "epoch": 0.798338717120443,
1288
+ "grad_norm": 1.3640880739873995,
1289
+ "learning_rate": 1.1859514557060358e-06,
1290
+ "loss": 0.9184,
1291
  "step": 865
1292
  },
1293
  {
1294
  "epoch": 0.8029533917858791,
1295
+ "grad_norm": 1.4032390358058673,
1296
+ "learning_rate": 1.1343083721469867e-06,
1297
+ "loss": 0.9234,
1298
  "step": 870
1299
  },
1300
  {
1301
  "epoch": 0.8075680664513152,
1302
+ "grad_norm": 1.365688843862272,
1303
+ "learning_rate": 1.0836706903316052e-06,
1304
+ "loss": 0.9244,
1305
  "step": 875
1306
  },
1307
  {
1308
  "epoch": 0.8121827411167513,
1309
+ "grad_norm": 1.4146840493944073,
1310
+ "learning_rate": 1.0340515802740781e-06,
1311
+ "loss": 0.9197,
1312
  "step": 880
1313
  },
1314
  {
1315
  "epoch": 0.8167974157821873,
1316
+ "grad_norm": 1.3895617974460883,
1317
+ "learning_rate": 9.854639470751182e-07,
1318
+ "loss": 0.9194,
1319
  "step": 885
1320
  },
1321
  {
1322
  "epoch": 0.8214120904476234,
1323
+ "grad_norm": 1.4357287631638345,
1324
+ "learning_rate": 9.379204275655524e-07,
1325
+ "loss": 0.9156,
1326
  "step": 890
1327
  },
1328
  {
1329
  "epoch": 0.8260267651130595,
1330
+ "grad_norm": 1.3844689117152469,
1331
+ "learning_rate": 8.914333870197022e-07,
1332
+ "loss": 0.9355,
1333
  "step": 895
1334
  },
1335
  {
1336
  "epoch": 0.8306414397784956,
1337
+ "grad_norm": 1.3689079608163703,
1338
+ "learning_rate": 8.460149159393766e-07,
1339
+ "loss": 0.9221,
1340
  "step": 900
1341
  },
1342
  {
1343
  "epoch": 0.8306414397784956,
1344
+ "eval_loss": 0.9320199489593506,
1345
+ "eval_runtime": 630.5251,
1346
+ "eval_samples_per_second": 24.345,
1347
+ "eval_steps_per_second": 0.19,
1348
  "step": 900
1349
  },
1350
  {
1351
  "epoch": 0.8352561144439317,
1352
+ "grad_norm": 1.373835743305699,
1353
+ "learning_rate": 8.016768269093389e-07,
1354
+ "loss": 0.9225,
1355
  "step": 905
1356
  },
1357
  {
1358
  "epoch": 0.8398707891093677,
1359
+ "grad_norm": 1.3862642218872392,
1360
+ "learning_rate": 7.584306515250461e-07,
1361
+ "loss": 0.9118,
1362
  "step": 910
1363
  },
1364
  {
1365
  "epoch": 0.8444854637748038,
1366
+ "grad_norm": 1.427561354442013,
1367
+ "learning_rate": 7.162876373934813e-07,
1368
+ "loss": 0.9257,
1369
  "step": 915
1370
  },
1371
  {
1372
  "epoch": 0.84910013844024,
1373
+ "grad_norm": 1.3318980900547395,
1374
+ "learning_rate": 6.752587452078297e-07,
1375
+ "loss": 0.9168,
1376
  "step": 920
1377
  },
1378
  {
1379
  "epoch": 0.8537148131056761,
1380
+ "grad_norm": 1.426821326563428,
1381
+ "learning_rate": 6.353546458967957e-07,
1382
+ "loss": 0.9269,
1383
  "step": 925
1384
  },
1385
  {
1386
  "epoch": 0.8583294877711122,
1387
+ "grad_norm": 1.4130164875607825,
1388
+ "learning_rate": 5.965857178492629e-07,
1389
+ "loss": 0.9177,
1390
  "step": 930
1391
  },
1392
  {
1393
  "epoch": 0.8629441624365483,
1394
+ "grad_norm": 1.37338302698056,
1395
+ "learning_rate": 5.589620442150579e-07,
1396
+ "loss": 0.908,
1397
  "step": 935
1398
  },
1399
  {
1400
  "epoch": 0.8675588371019843,
1401
+ "grad_norm": 1.3675248953715173,
1402
+ "learning_rate": 5.224934102824824e-07,
1403
+ "loss": 0.9018,
1404
  "step": 940
1405
  },
1406
  {
1407
  "epoch": 0.8721735117674204,
1408
+ "grad_norm": 1.3837632491876184,
1409
+ "learning_rate": 4.871893009333345e-07,
1410
+ "loss": 0.9266,
1411
  "step": 945
1412
  },
1413
  {
1414
  "epoch": 0.8767881864328565,
1415
+ "grad_norm": 1.3573040010924988,
1416
+ "learning_rate": 4.5305889817603757e-07,
1417
+ "loss": 0.9303,
1418
  "step": 950
1419
  },
1420
  {
1421
  "epoch": 0.8814028610982926,
1422
+ "grad_norm": 1.3246603553725993,
1423
+ "learning_rate": 4.201110787575619e-07,
1424
+ "loss": 0.9003,
1425
  "step": 955
1426
  },
1427
  {
1428
  "epoch": 0.8860175357637287,
1429
+ "grad_norm": 1.3518273420371894,
1430
+ "learning_rate": 3.883544118547289e-07,
1431
+ "loss": 0.9223,
1432
  "step": 960
1433
  },
1434
  {
1435
  "epoch": 0.8906322104291647,
1436
+ "grad_norm": 1.3931507467693585,
1437
+ "learning_rate": 3.5779715684550966e-07,
1438
+ "loss": 0.9233,
1439
  "step": 965
1440
  },
1441
  {
1442
  "epoch": 0.8952468850946008,
1443
+ "grad_norm": 1.4917970308046273,
1444
+ "learning_rate": 3.284472611609024e-07,
1445
+ "loss": 0.9262,
1446
  "step": 970
1447
  },
1448
  {
1449
  "epoch": 0.8998615597600369,
1450
+ "grad_norm": 1.3457919919745878,
1451
+ "learning_rate": 3.00312358217934e-07,
1452
+ "loss": 0.9061,
1453
  "step": 975
1454
  },
1455
  {
1456
  "epoch": 0.904476234425473,
1457
+ "grad_norm": 1.399847886161994,
1458
+ "learning_rate": 2.7339976543434065e-07,
1459
+ "loss": 0.9303,
1460
  "step": 980
1461
  },
1462
  {
1463
  "epoch": 0.9090909090909091,
1464
+ "grad_norm": 1.4141769147196013,
1465
+ "learning_rate": 2.4771648232542524e-07,
1466
+ "loss": 0.9184,
1467
  "step": 985
1468
  },
1469
  {
1470
  "epoch": 0.9137055837563451,
1471
+ "grad_norm": 1.4105606054413926,
1472
+ "learning_rate": 2.2326918868360636e-07,
1473
+ "loss": 0.923,
1474
  "step": 990
1475
  },
1476
  {
1477
  "epoch": 0.9183202584217812,
1478
+ "grad_norm": 1.3589359885492762,
1479
+ "learning_rate": 2.000642428411087e-07,
1480
+ "loss": 0.9274,
1481
  "step": 995
1482
  },
1483
  {
1484
  "epoch": 0.9229349330872173,
1485
+ "grad_norm": 1.3455010064913675,
1486
+ "learning_rate": 1.7810768001627455e-07,
1487
+ "loss": 0.9087,
1488
  "step": 1000
1489
  },
1490
  {
1491
  "epoch": 0.9229349330872173,
1492
+ "eval_loss": 0.9292727112770081,
1493
+ "eval_runtime": 1081.8404,
1494
+ "eval_samples_per_second": 14.189,
1495
+ "eval_steps_per_second": 0.111,
1496
  "step": 1000
1497
  },
1498
  {
1499
  "epoch": 0.9275496077526535,
1500
+ "grad_norm": 1.3936222044373034,
1501
+ "learning_rate": 1.5740521074389837e-07,
1502
+ "loss": 0.9294,
1503
  "step": 1005
1504
  },
1505
  {
1506
  "epoch": 0.9321642824180896,
1507
+ "grad_norm": 1.6247273551888408,
1508
+ "learning_rate": 1.3796221939001598e-07,
1509
+ "loss": 0.9291,
1510
  "step": 1010
1511
  },
1512
  {
1513
  "epoch": 0.9367789570835257,
1514
+ "grad_norm": 1.386776941906468,
1515
+ "learning_rate": 1.1978376275151915e-07,
1516
+ "loss": 0.9116,
1517
  "step": 1015
1518
  },
1519
  {
1520
  "epoch": 0.9413936317489617,
1521
+ "grad_norm": 1.3387125746511046,
1522
+ "learning_rate": 1.0287456874096824e-07,
1523
+ "loss": 0.9137,
1524
  "step": 1020
1525
  },
1526
  {
1527
  "epoch": 0.9460083064143978,
1528
+ "grad_norm": 1.3731528656657377,
1529
+ "learning_rate": 8.723903515694132e-08,
1530
+ "loss": 0.9208,
1531
  "step": 1025
1532
  },
1533
  {
1534
  "epoch": 0.9506229810798339,
1535
+ "grad_norm": 1.3785247362215645,
1536
+ "learning_rate": 7.28812285402386e-08,
1537
+ "loss": 0.9062,
1538
  "step": 1030
1539
  },
1540
  {
1541
  "epoch": 0.95523765574527,
1542
+ "grad_norm": 1.4102762199212637,
1543
+ "learning_rate": 5.98048831162451e-08,
1544
+ "loss": 0.9286,
1545
  "step": 1035
1546
  },
1547
  {
1548
  "epoch": 0.9598523304107061,
1549
+ "grad_norm": 1.3925062428618178,
1550
+ "learning_rate": 4.801339982372144e-08,
1551
+ "loss": 0.9066,
1552
  "step": 1040
1553
  },
1554
  {
1555
  "epoch": 0.9644670050761421,
1556
+ "grad_norm": 1.3653301284738,
1557
+ "learning_rate": 3.750984543027358e-08,
1558
+ "loss": 0.9197,
1559
  "step": 1045
1560
  },
1561
  {
1562
  "epoch": 0.9690816797415782,
1563
+ "grad_norm": 1.3996435505145486,
1564
+ "learning_rate": 2.8296951734740896e-08,
1565
+ "loss": 0.9303,
1566
  "step": 1050
1567
  },
1568
  {
1569
  "epoch": 0.9736963544070143,
1570
+ "grad_norm": 1.367940583530902,
1571
+ "learning_rate": 2.0377114856700575e-08,
1572
+ "loss": 0.9286,
1573
  "step": 1055
1574
  },
1575
  {
1576
  "epoch": 0.9783110290724504,
1577
+ "grad_norm": 1.4028823114794313,
1578
+ "learning_rate": 1.3752394613274488e-08,
1579
+ "loss": 0.9169,
1580
  "step": 1060
1581
  },
1582
  {
1583
  "epoch": 0.9829257037378865,
1584
+ "grad_norm": 1.354080160910544,
1585
+ "learning_rate": 8.424513983408267e-09,
1586
+ "loss": 0.9156,
1587
  "step": 1065
1588
  },
1589
  {
1590
  "epoch": 0.9875403784033225,
1591
+ "grad_norm": 1.3348691278230316,
1592
+ "learning_rate": 4.3948586597525325e-09,
1593
+ "loss": 0.9243,
1594
  "step": 1070
1595
  },
1596
  {
1597
  "epoch": 0.9921550530687586,
1598
+ "grad_norm": 1.3552426228812648,
1599
+ "learning_rate": 1.664476688265082e-09,
1600
+ "loss": 0.9195,
1601
  "step": 1075
1602
  },
1603
  {
1604
  "epoch": 0.9967697277341947,
1605
+ "grad_norm": 1.3989954260059947,
1606
+ "learning_rate": 2.3407819563503463e-10,
1607
+ "loss": 0.9093,
1608
  "step": 1080
1609
  },
1610
  {
1611
  "epoch": 0.9995385325334564,
1612
  "step": 1083,
1613
  "total_flos": 453306954547200.0,
1614
+ "train_loss": 0.9547446678880179,
1615
+ "train_runtime": 38927.5133,
1616
+ "train_samples_per_second": 3.563,
1617
+ "train_steps_per_second": 0.028
1618
  }
1619
  ],
1620
  "logging_steps": 5,