qingyangzhang committed on
Commit
2a1d86e
·
verified ·
1 Parent(s): 4af3daf

Model save

Browse files
README.md CHANGED
@@ -1,10 +1,8 @@
1
  ---
2
- datasets: qingyangzhang/natural_reasoning_simple
3
  library_name: transformers
4
  model_name: Qwen2.5-3B-SFT-NR
5
  tags:
6
  - generated_from_trainer
7
- - open-r1
8
  - trl
9
  - sft
10
  licence: license
@@ -12,7 +10,7 @@ licence: license
12
 
13
  # Model Card for Qwen2.5-3B-SFT-NR
14
 
15
- This model is a fine-tuned version of [None](https://huggingface.co/None) on the [qingyangzhang/natural_reasoning_simple](https://huggingface.co/datasets/qingyangzhang/natural_reasoning_simple) dataset.
16
  It has been trained using [TRL](https://github.com/huggingface/trl).
17
 
18
  ## Quick start
@@ -28,7 +26,7 @@ print(output["generated_text"])
28
 
29
  ## Training procedure
30
 
31
- [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/rq6uid9l)
32
 
33
 
34
  This model was trained with SFT.
 
1
  ---
 
2
  library_name: transformers
3
  model_name: Qwen2.5-3B-SFT-NR
4
  tags:
5
  - generated_from_trainer
 
6
  - trl
7
  - sft
8
  licence: license
 
10
 
11
  # Model Card for Qwen2.5-3B-SFT-NR
12
 
13
+ This model is a fine-tuned version of [None](https://huggingface.co/None).
14
  It has been trained using [TRL](https://github.com/huggingface/trl).
15
 
16
  ## Quick start
 
26
 
27
  ## Training procedure
28
 
29
+ [<img src="https://raw.githubusercontent.com/wandb/assets/main/wandb-github-badge-28.svg" alt="Visualize in Weights & Biases" width="150" height="24"/>](https://wandb.ai/zqyoung1127-tianjin-university/huggingface/runs/iwcrfhuo)
30
 
31
 
32
  This model was trained with SFT.
all_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9990049751243781,
3
- "total_flos": 5.849427398046515e+16,
4
- "train_loss": 0.6000438257755036,
5
- "train_runtime": 5544.176,
6
  "train_samples": 12058,
7
- "train_samples_per_second": 2.175,
8
- "train_steps_per_second": 0.045
9
  }
 
1
  {
2
  "epoch": 0.9990049751243781,
3
+ "total_flos": 1.3756419824156672e+17,
4
+ "train_loss": 0.5190648520847716,
5
+ "train_runtime": 16049.3674,
6
  "train_samples": 12058,
7
+ "train_samples_per_second": 0.751,
8
+ "train_steps_per_second": 0.016
9
  }
model-00001-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:998c8a02208e7f7db685f72280017c50de3dba0680318964ea7bb91011282c69
3
  size 4877660776
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21822537d93795139db5e2a05d0782ba2b1d1d82fba5d8d4da29ce9f649a2e0a
3
  size 4877660776
model-00002-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:34605058b2d391627549bbdf623845d3f38dd00b9affc7ef9684cdb5393f37f4
3
  size 4932751008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e879926cd01bf245c5c5730b7aaf31a715fb1fda005a5c5a3c91571024ed53ea
3
  size 4932751008
model-00003-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32b5a4cc8762f4784f4712433ff897f7c42b168b031bf75e8587dfcc3672057d
3
  size 4330865200
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a8e188f27852a2145dd3852bc076dd9887ec79e37d7c081bcffa403a964886ac
3
  size 4330865200
model-00004-of-00004.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e6c38a45f0cfcc2cea2e1dcf080e6b4b22b19a12f1cb365416592aa582fd20c
3
  size 1089994880
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4815718d8936325e50130a9241155b6e796d4a2f031863cdf8fc1fb8d2715ad
3
  size 1089994880
train_results.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "epoch": 0.9990049751243781,
3
- "total_flos": 5.849427398046515e+16,
4
- "train_loss": 0.6000438257755036,
5
- "train_runtime": 5544.176,
6
  "train_samples": 12058,
7
- "train_samples_per_second": 2.175,
8
- "train_steps_per_second": 0.045
9
  }
 
1
  {
2
  "epoch": 0.9990049751243781,
3
+ "total_flos": 1.3756419824156672e+17,
4
+ "train_loss": 0.5190648520847716,
5
+ "train_runtime": 16049.3674,
6
  "train_samples": 12058,
7
+ "train_samples_per_second": 0.751,
8
+ "train_steps_per_second": 0.016
9
  }
trainer_state.json CHANGED
@@ -10,1769 +10,1769 @@
10
  "log_history": [
11
  {
12
  "epoch": 0.003980099502487562,
13
- "grad_norm": 1.8041054010391235,
14
  "learning_rate": 1e-06,
15
- "loss": 1.2189,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.007960199004975124,
20
- "grad_norm": 1.606231451034546,
21
  "learning_rate": 1e-06,
22
- "loss": 1.1372,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.011940298507462687,
27
- "grad_norm": 1.5864207744598389,
28
  "learning_rate": 1e-06,
29
- "loss": 1.1304,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.015920398009950248,
34
- "grad_norm": 1.6760976314544678,
35
  "learning_rate": 1e-06,
36
- "loss": 1.3062,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.01990049751243781,
41
- "grad_norm": 1.0894922018051147,
42
  "learning_rate": 1e-06,
43
- "loss": 1.0851,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.023880597014925373,
48
- "grad_norm": 1.197535514831543,
49
  "learning_rate": 1e-06,
50
- "loss": 1.2047,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.027860696517412936,
55
- "grad_norm": 1.1388044357299805,
56
  "learning_rate": 1e-06,
57
- "loss": 1.1113,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.031840796019900496,
62
- "grad_norm": 1.118986964225769,
63
  "learning_rate": 1e-06,
64
- "loss": 1.129,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03582089552238806,
69
- "grad_norm": 1.1859782934188843,
70
  "learning_rate": 1e-06,
71
- "loss": 1.1824,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.03980099502487562,
76
- "grad_norm": 1.0059858560562134,
77
  "learning_rate": 1e-06,
78
- "loss": 1.0065,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04378109452736319,
83
- "grad_norm": 1.088222622871399,
84
  "learning_rate": 1e-06,
85
- "loss": 1.0444,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.04776119402985075,
90
- "grad_norm": 1.1014976501464844,
91
  "learning_rate": 1e-06,
92
- "loss": 1.0654,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.051741293532338306,
97
- "grad_norm": 0.9451438188552856,
98
  "learning_rate": 1e-06,
99
- "loss": 0.9278,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.05572139303482587,
104
- "grad_norm": 1.109749436378479,
105
  "learning_rate": 1e-06,
106
- "loss": 1.0397,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.05970149253731343,
111
- "grad_norm": 1.0444092750549316,
112
  "learning_rate": 1e-06,
113
- "loss": 1.0032,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06368159203980099,
118
- "grad_norm": 1.056223750114441,
119
  "learning_rate": 1e-06,
120
- "loss": 0.9843,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.06766169154228856,
125
- "grad_norm": 0.9420551657676697,
126
  "learning_rate": 1e-06,
127
- "loss": 0.8722,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07164179104477612,
132
- "grad_norm": 0.9519243240356445,
133
  "learning_rate": 1e-06,
134
- "loss": 0.8713,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.07562189054726368,
139
- "grad_norm": 0.8667258620262146,
140
  "learning_rate": 1e-06,
141
- "loss": 0.8384,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.07960199004975124,
146
- "grad_norm": 0.9024590253829956,
147
  "learning_rate": 1e-06,
148
- "loss": 0.8198,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.08358208955223881,
153
- "grad_norm": 0.8790098428726196,
154
  "learning_rate": 1e-06,
155
- "loss": 0.8539,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.08756218905472637,
160
- "grad_norm": 0.7695945501327515,
161
  "learning_rate": 1e-06,
162
- "loss": 0.797,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09154228855721393,
167
- "grad_norm": 0.830602765083313,
168
  "learning_rate": 1e-06,
169
- "loss": 0.8038,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.0955223880597015,
174
- "grad_norm": 0.7355982661247253,
175
  "learning_rate": 1e-06,
176
- "loss": 0.7901,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.09950248756218906,
181
- "grad_norm": 0.7058648467063904,
182
  "learning_rate": 1e-06,
183
- "loss": 0.7933,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.10348258706467661,
188
- "grad_norm": 0.8061387538909912,
189
  "learning_rate": 1e-06,
190
- "loss": 0.7369,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.10746268656716418,
195
- "grad_norm": 0.7414054870605469,
196
  "learning_rate": 1e-06,
197
- "loss": 0.7798,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.11144278606965174,
202
- "grad_norm": 0.7229103446006775,
203
  "learning_rate": 1e-06,
204
- "loss": 0.7071,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.1154228855721393,
209
- "grad_norm": 0.6890265345573425,
210
  "learning_rate": 1e-06,
211
- "loss": 0.649,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.11940298507462686,
216
- "grad_norm": 0.6917344927787781,
217
  "learning_rate": 1e-06,
218
- "loss": 0.7381,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.12338308457711443,
223
- "grad_norm": 0.6370529532432556,
224
  "learning_rate": 1e-06,
225
- "loss": 0.7016,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.12736318407960198,
230
- "grad_norm": 0.5392922163009644,
231
  "learning_rate": 1e-06,
232
- "loss": 0.5861,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.13134328358208955,
237
- "grad_norm": 0.5614864826202393,
238
  "learning_rate": 1e-06,
239
- "loss": 0.637,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.13532338308457711,
244
- "grad_norm": 0.5575302839279175,
245
  "learning_rate": 1e-06,
246
- "loss": 0.6303,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.13930348258706468,
251
- "grad_norm": 0.5416925549507141,
252
  "learning_rate": 1e-06,
253
- "loss": 0.6533,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.14328358208955225,
258
- "grad_norm": 0.5551822185516357,
259
  "learning_rate": 1e-06,
260
- "loss": 0.6362,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1472636815920398,
265
- "grad_norm": 0.5346453785896301,
266
  "learning_rate": 1e-06,
267
- "loss": 0.6369,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.15124378109452735,
272
- "grad_norm": 0.48347029089927673,
273
  "learning_rate": 1e-06,
274
- "loss": 0.6146,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.15522388059701492,
279
- "grad_norm": 0.5139867663383484,
280
  "learning_rate": 1e-06,
281
- "loss": 0.6108,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.15920398009950248,
286
- "grad_norm": 0.492990642786026,
287
  "learning_rate": 1e-06,
288
- "loss": 0.6167,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.16318407960199005,
293
- "grad_norm": 0.4089691638946533,
294
  "learning_rate": 1e-06,
295
- "loss": 0.5995,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.16716417910447762,
300
- "grad_norm": 0.3620274066925049,
301
  "learning_rate": 1e-06,
302
- "loss": 0.5853,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.17114427860696518,
307
- "grad_norm": 0.35234397649765015,
308
  "learning_rate": 1e-06,
309
- "loss": 0.5983,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.17512437810945275,
314
- "grad_norm": 0.3323567509651184,
315
  "learning_rate": 1e-06,
316
- "loss": 0.5675,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.1791044776119403,
321
- "grad_norm": 0.3100694417953491,
322
  "learning_rate": 1e-06,
323
- "loss": 0.6015,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.18308457711442785,
328
- "grad_norm": 0.31179943680763245,
329
  "learning_rate": 1e-06,
330
- "loss": 0.592,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.18706467661691542,
335
- "grad_norm": 0.3240714967250824,
336
  "learning_rate": 1e-06,
337
- "loss": 0.5945,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.191044776119403,
342
- "grad_norm": 0.30923616886138916,
343
  "learning_rate": 1e-06,
344
- "loss": 0.5788,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.19502487562189055,
349
- "grad_norm": 0.3096090257167816,
350
  "learning_rate": 1e-06,
351
- "loss": 0.5884,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.19900497512437812,
356
- "grad_norm": 0.2709506154060364,
357
  "learning_rate": 1e-06,
358
- "loss": 0.544,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.20298507462686566,
363
- "grad_norm": 0.3078024089336395,
364
  "learning_rate": 1e-06,
365
- "loss": 0.5854,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.20696517412935322,
370
- "grad_norm": 0.31205838918685913,
371
  "learning_rate": 1e-06,
372
- "loss": 0.5846,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2109452736318408,
377
- "grad_norm": 0.2879401743412018,
378
  "learning_rate": 1e-06,
379
- "loss": 0.5937,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.21492537313432836,
384
- "grad_norm": 0.2684524953365326,
385
  "learning_rate": 1e-06,
386
- "loss": 0.5209,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.21890547263681592,
391
- "grad_norm": 0.27748343348503113,
392
  "learning_rate": 1e-06,
393
- "loss": 0.5575,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.2228855721393035,
398
- "grad_norm": 0.31936174631118774,
399
  "learning_rate": 1e-06,
400
- "loss": 0.6562,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.22686567164179106,
405
- "grad_norm": 0.30099964141845703,
406
  "learning_rate": 1e-06,
407
- "loss": 0.5912,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.2308457711442786,
412
- "grad_norm": 0.30249732732772827,
413
  "learning_rate": 1e-06,
414
- "loss": 0.657,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.23482587064676616,
419
- "grad_norm": 0.28535589575767517,
420
  "learning_rate": 1e-06,
421
- "loss": 0.5827,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.23880597014925373,
426
- "grad_norm": 0.2907682955265045,
427
  "learning_rate": 1e-06,
428
- "loss": 0.5745,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.2427860696517413,
433
- "grad_norm": 0.2832544445991516,
434
  "learning_rate": 1e-06,
435
- "loss": 0.5534,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.24676616915422886,
440
- "grad_norm": 0.2882274389266968,
441
  "learning_rate": 1e-06,
442
- "loss": 0.5717,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2507462686567164,
447
- "grad_norm": 0.28751009702682495,
448
  "learning_rate": 1e-06,
449
- "loss": 0.5915,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.25472636815920396,
454
- "grad_norm": 0.2818026542663574,
455
  "learning_rate": 1e-06,
456
- "loss": 0.5793,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.25870646766169153,
461
- "grad_norm": 0.29114875197410583,
462
  "learning_rate": 1e-06,
463
- "loss": 0.5577,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2626865671641791,
468
- "grad_norm": 0.3001895546913147,
469
  "learning_rate": 1e-06,
470
- "loss": 0.5792,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.26666666666666666,
475
- "grad_norm": 0.28489118814468384,
476
  "learning_rate": 1e-06,
477
- "loss": 0.6217,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.27064676616915423,
482
- "grad_norm": 0.27548784017562866,
483
  "learning_rate": 1e-06,
484
- "loss": 0.603,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.2746268656716418,
489
- "grad_norm": 0.2983139455318451,
490
  "learning_rate": 1e-06,
491
- "loss": 0.6069,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.27860696517412936,
496
- "grad_norm": 0.2885805070400238,
497
  "learning_rate": 1e-06,
498
- "loss": 0.6058,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.28258706467661693,
503
- "grad_norm": 0.28651854395866394,
504
  "learning_rate": 1e-06,
505
- "loss": 0.5814,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.2865671641791045,
510
- "grad_norm": 0.2910130023956299,
511
  "learning_rate": 1e-06,
512
- "loss": 0.6039,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.29054726368159206,
517
- "grad_norm": 0.2883201241493225,
518
  "learning_rate": 1e-06,
519
- "loss": 0.586,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.2945273631840796,
524
- "grad_norm": 0.27827897667884827,
525
  "learning_rate": 1e-06,
526
- "loss": 0.5844,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.29850746268656714,
531
- "grad_norm": 0.2674331068992615,
532
  "learning_rate": 1e-06,
533
- "loss": 0.5966,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3024875621890547,
538
- "grad_norm": 0.27721738815307617,
539
  "learning_rate": 1e-06,
540
- "loss": 0.5651,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.30646766169154227,
545
- "grad_norm": 0.29553672671318054,
546
  "learning_rate": 1e-06,
547
- "loss": 0.5578,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.31044776119402984,
552
- "grad_norm": 0.27353787422180176,
553
  "learning_rate": 1e-06,
554
- "loss": 0.5778,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.3144278606965174,
559
- "grad_norm": 0.2708923816680908,
560
  "learning_rate": 1e-06,
561
- "loss": 0.5637,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.31840796019900497,
566
- "grad_norm": 0.2771095931529999,
567
  "learning_rate": 1e-06,
568
- "loss": 0.5421,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.32238805970149254,
573
- "grad_norm": 0.28794559836387634,
574
  "learning_rate": 1e-06,
575
- "loss": 0.56,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.3263681592039801,
580
- "grad_norm": 0.27953365445137024,
581
  "learning_rate": 1e-06,
582
- "loss": 0.5943,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.33034825870646767,
587
- "grad_norm": 0.2918912470340729,
588
  "learning_rate": 1e-06,
589
- "loss": 0.5797,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.33432835820895523,
594
- "grad_norm": 0.29445740580558777,
595
  "learning_rate": 1e-06,
596
- "loss": 0.5675,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3383084577114428,
601
- "grad_norm": 0.2901161313056946,
602
  "learning_rate": 1e-06,
603
- "loss": 0.5775,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.34228855721393037,
608
- "grad_norm": 0.27226191759109497,
609
  "learning_rate": 1e-06,
610
- "loss": 0.5638,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.34626865671641793,
615
- "grad_norm": 0.28128597140312195,
616
  "learning_rate": 1e-06,
617
- "loss": 0.5591,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3502487562189055,
622
- "grad_norm": 0.2813471853733063,
623
  "learning_rate": 1e-06,
624
- "loss": 0.5989,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.354228855721393,
629
- "grad_norm": 0.2899133265018463,
630
  "learning_rate": 1e-06,
631
- "loss": 0.584,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.3582089552238806,
636
- "grad_norm": 0.2919646203517914,
637
  "learning_rate": 1e-06,
638
- "loss": 0.5764,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.36218905472636814,
643
- "grad_norm": 0.2885926365852356,
644
  "learning_rate": 1e-06,
645
- "loss": 0.5623,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3661691542288557,
650
- "grad_norm": 0.28255367279052734,
651
  "learning_rate": 1e-06,
652
- "loss": 0.6061,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.3701492537313433,
657
- "grad_norm": 0.2776722013950348,
658
  "learning_rate": 1e-06,
659
- "loss": 0.588,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.37412935323383084,
664
- "grad_norm": 0.3004148304462433,
665
  "learning_rate": 1e-06,
666
- "loss": 0.6002,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.3781094527363184,
671
- "grad_norm": 0.2883853316307068,
672
  "learning_rate": 1e-06,
673
- "loss": 0.5886,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.382089552238806,
678
- "grad_norm": 0.2858606278896332,
679
  "learning_rate": 1e-06,
680
- "loss": 0.546,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.38606965174129354,
685
- "grad_norm": 0.30112016201019287,
686
  "learning_rate": 1e-06,
687
- "loss": 0.5814,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.3900497512437811,
692
- "grad_norm": 0.2831226587295532,
693
  "learning_rate": 1e-06,
694
- "loss": 0.5411,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.3940298507462687,
699
- "grad_norm": 0.3117291331291199,
700
  "learning_rate": 1e-06,
701
- "loss": 0.6567,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.39800995024875624,
706
- "grad_norm": 0.2813672125339508,
707
  "learning_rate": 1e-06,
708
- "loss": 0.5674,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4019900497512438,
713
- "grad_norm": 0.2731095850467682,
714
  "learning_rate": 1e-06,
715
- "loss": 0.5819,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.4059701492537313,
720
- "grad_norm": 0.29545432329177856,
721
  "learning_rate": 1e-06,
722
- "loss": 0.5966,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.4099502487562189,
727
- "grad_norm": 0.26830869913101196,
728
  "learning_rate": 1e-06,
729
- "loss": 0.5747,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.41393034825870645,
734
- "grad_norm": 0.30151620507240295,
735
  "learning_rate": 1e-06,
736
- "loss": 0.6733,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.417910447761194,
741
- "grad_norm": 0.2833845317363739,
742
  "learning_rate": 1e-06,
743
- "loss": 0.595,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.4218905472636816,
748
- "grad_norm": 0.27560508251190186,
749
  "learning_rate": 1e-06,
750
- "loss": 0.5554,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.42587064676616915,
755
- "grad_norm": 0.3009320795536041,
756
  "learning_rate": 1e-06,
757
- "loss": 0.5698,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.4298507462686567,
762
- "grad_norm": 0.2834017872810364,
763
  "learning_rate": 1e-06,
764
- "loss": 0.5904,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4338308457711443,
769
- "grad_norm": 0.27971693873405457,
770
  "learning_rate": 1e-06,
771
- "loss": 0.5555,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.43781094527363185,
776
- "grad_norm": 0.27217191457748413,
777
  "learning_rate": 1e-06,
778
- "loss": 0.5594,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.4417910447761194,
783
- "grad_norm": 0.28083258867263794,
784
  "learning_rate": 1e-06,
785
- "loss": 0.5766,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.445771144278607,
790
- "grad_norm": 0.29860496520996094,
791
  "learning_rate": 1e-06,
792
- "loss": 0.5622,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.44975124378109455,
797
- "grad_norm": 0.2839198410511017,
798
  "learning_rate": 1e-06,
799
- "loss": 0.5441,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4537313432835821,
804
- "grad_norm": 0.28053733706474304,
805
  "learning_rate": 1e-06,
806
- "loss": 0.545,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.4577114427860697,
811
- "grad_norm": 0.28944674134254456,
812
  "learning_rate": 1e-06,
813
- "loss": 0.5414,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4616915422885572,
818
- "grad_norm": 1.7277145385742188,
819
  "learning_rate": 1e-06,
820
- "loss": 0.5376,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.46567164179104475,
825
- "grad_norm": 0.26408037543296814,
826
  "learning_rate": 1e-06,
827
- "loss": 0.5273,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.4696517412935323,
832
- "grad_norm": 0.2752501666545868,
833
  "learning_rate": 1e-06,
834
- "loss": 0.5223,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.4736318407960199,
839
- "grad_norm": 0.31200143694877625,
840
  "learning_rate": 1e-06,
841
- "loss": 0.6251,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.47761194029850745,
846
- "grad_norm": 0.2889968156814575,
847
  "learning_rate": 1e-06,
848
- "loss": 0.548,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.481592039800995,
853
- "grad_norm": 0.272776335477829,
854
  "learning_rate": 1e-06,
855
- "loss": 0.5353,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.4855721393034826,
860
- "grad_norm": 0.29524046182632446,
861
  "learning_rate": 1e-06,
862
- "loss": 0.5834,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.48955223880597015,
867
- "grad_norm": 0.2750682830810547,
868
  "learning_rate": 1e-06,
869
- "loss": 0.5769,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.4935323383084577,
874
- "grad_norm": 0.28290194272994995,
875
  "learning_rate": 1e-06,
876
- "loss": 0.5749,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.4975124378109453,
881
- "grad_norm": 0.2784881889820099,
882
  "learning_rate": 1e-06,
883
- "loss": 0.5675,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5014925373134328,
888
- "grad_norm": 0.28352829813957214,
889
  "learning_rate": 1e-06,
890
- "loss": 0.5544,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5054726368159204,
895
- "grad_norm": 0.4005744457244873,
896
  "learning_rate": 1e-06,
897
- "loss": 0.4916,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5094527363184079,
902
- "grad_norm": 0.2907276153564453,
903
  "learning_rate": 1e-06,
904
- "loss": 0.5842,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5134328358208955,
909
- "grad_norm": 0.27371498942375183,
910
  "learning_rate": 1e-06,
911
- "loss": 0.5298,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5174129353233831,
916
- "grad_norm": 0.268046110868454,
917
  "learning_rate": 1e-06,
918
- "loss": 0.5488,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5213930348258706,
923
- "grad_norm": 0.27211833000183105,
924
  "learning_rate": 1e-06,
925
- "loss": 0.548,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5253731343283582,
930
- "grad_norm": 0.28055205941200256,
931
  "learning_rate": 1e-06,
932
- "loss": 0.5506,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5293532338308458,
937
- "grad_norm": 0.28549808263778687,
938
  "learning_rate": 1e-06,
939
- "loss": 0.5514,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.5333333333333333,
944
- "grad_norm": 0.2873031198978424,
945
  "learning_rate": 1e-06,
946
- "loss": 0.5868,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5373134328358209,
951
- "grad_norm": 0.26007169485092163,
952
  "learning_rate": 1e-06,
953
- "loss": 0.4835,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5412935323383085,
958
- "grad_norm": 0.27581357955932617,
959
  "learning_rate": 1e-06,
960
- "loss": 0.515,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.545273631840796,
965
- "grad_norm": 0.2559061050415039,
966
  "learning_rate": 1e-06,
967
- "loss": 0.4964,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.5492537313432836,
972
- "grad_norm": 0.26830771565437317,
973
  "learning_rate": 1e-06,
974
- "loss": 0.5285,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5532338308457712,
979
- "grad_norm": 0.2840443253517151,
980
  "learning_rate": 1e-06,
981
- "loss": 0.5135,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.5572139303482587,
986
- "grad_norm": 0.27029529213905334,
987
  "learning_rate": 1e-06,
988
- "loss": 0.5273,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.5611940298507463,
993
- "grad_norm": 0.2841308116912842,
994
  "learning_rate": 1e-06,
995
- "loss": 0.5804,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.5651741293532339,
1000
- "grad_norm": 0.28251802921295166,
1001
  "learning_rate": 1e-06,
1002
- "loss": 0.5554,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.5691542288557214,
1007
- "grad_norm": 0.2795189321041107,
1008
  "learning_rate": 1e-06,
1009
- "loss": 0.5299,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.573134328358209,
1014
- "grad_norm": 0.29494765400886536,
1015
  "learning_rate": 1e-06,
1016
- "loss": 0.5866,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.5771144278606966,
1021
- "grad_norm": 0.26426634192466736,
1022
  "learning_rate": 1e-06,
1023
- "loss": 0.4921,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.5810945273631841,
1028
- "grad_norm": 0.27161064743995667,
1029
  "learning_rate": 1e-06,
1030
- "loss": 0.5156,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.5850746268656717,
1035
- "grad_norm": 0.2546272277832031,
1036
  "learning_rate": 1e-06,
1037
- "loss": 0.4764,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.5890547263681593,
1042
- "grad_norm": 0.26822739839553833,
1043
  "learning_rate": 1e-06,
1044
- "loss": 0.5317,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.5930348258706468,
1049
- "grad_norm": 0.28637799620628357,
1050
  "learning_rate": 1e-06,
1051
- "loss": 0.5488,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.5970149253731343,
1056
- "grad_norm": 0.29014742374420166,
1057
  "learning_rate": 1e-06,
1058
- "loss": 0.5567,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.6009950248756218,
1063
- "grad_norm": 0.2683526873588562,
1064
  "learning_rate": 1e-06,
1065
- "loss": 0.5511,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6049751243781094,
1070
- "grad_norm": 0.27193310856819153,
1071
  "learning_rate": 1e-06,
1072
- "loss": 0.5253,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.608955223880597,
1077
- "grad_norm": 0.808740496635437,
1078
  "learning_rate": 1e-06,
1079
- "loss": 0.5254,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6129353233830845,
1084
- "grad_norm": 0.2881057858467102,
1085
  "learning_rate": 1e-06,
1086
- "loss": 0.5668,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6169154228855721,
1091
- "grad_norm": 0.28654593229293823,
1092
  "learning_rate": 1e-06,
1093
- "loss": 0.6033,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6208955223880597,
1098
- "grad_norm": 0.29203689098358154,
1099
  "learning_rate": 1e-06,
1100
- "loss": 0.5548,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6248756218905472,
1105
- "grad_norm": 0.2731221318244934,
1106
  "learning_rate": 1e-06,
1107
- "loss": 0.4972,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6288557213930348,
1112
- "grad_norm": 0.27775096893310547,
1113
  "learning_rate": 1e-06,
1114
- "loss": 0.4988,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6328358208955224,
1119
- "grad_norm": 0.2725508511066437,
1120
  "learning_rate": 1e-06,
1121
- "loss": 0.5338,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6368159203980099,
1126
- "grad_norm": 0.2905254364013672,
1127
  "learning_rate": 1e-06,
1128
- "loss": 0.5502,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6407960199004975,
1133
- "grad_norm": 0.2800814211368561,
1134
  "learning_rate": 1e-06,
1135
- "loss": 0.524,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6447761194029851,
1140
- "grad_norm": 0.29800140857696533,
1141
  "learning_rate": 1e-06,
1142
- "loss": 0.5658,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6487562189054726,
1147
- "grad_norm": 0.289701372385025,
1148
  "learning_rate": 1e-06,
1149
- "loss": 0.5322,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.6527363184079602,
1154
- "grad_norm": 0.3027022182941437,
1155
  "learning_rate": 1e-06,
1156
- "loss": 0.5575,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.6567164179104478,
1161
- "grad_norm": 0.29252082109451294,
1162
  "learning_rate": 1e-06,
1163
- "loss": 0.559,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.6606965174129353,
1168
- "grad_norm": 0.2698836326599121,
1169
  "learning_rate": 1e-06,
1170
- "loss": 0.502,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.6646766169154229,
1175
- "grad_norm": 0.27977052330970764,
1176
  "learning_rate": 1e-06,
1177
- "loss": 0.483,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.6686567164179105,
1182
- "grad_norm": 0.2937949001789093,
1183
  "learning_rate": 1e-06,
1184
- "loss": 0.5613,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.672636815920398,
1189
- "grad_norm": 0.2905248701572418,
1190
  "learning_rate": 1e-06,
1191
- "loss": 0.5369,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.6766169154228856,
1196
- "grad_norm": 0.27426132559776306,
1197
  "learning_rate": 1e-06,
1198
- "loss": 0.4985,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.6805970149253732,
1203
- "grad_norm": 0.2826381325721741,
1204
  "learning_rate": 1e-06,
1205
- "loss": 0.525,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.6845771144278607,
1210
- "grad_norm": 0.2896779477596283,
1211
  "learning_rate": 1e-06,
1212
- "loss": 0.5503,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.6885572139303483,
1217
- "grad_norm": 0.27713751792907715,
1218
  "learning_rate": 1e-06,
1219
- "loss": 0.5198,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.6925373134328359,
1224
- "grad_norm": 0.29340362548828125,
1225
  "learning_rate": 1e-06,
1226
- "loss": 0.5588,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.6965174129353234,
1231
- "grad_norm": 0.26327288150787354,
1232
  "learning_rate": 1e-06,
1233
- "loss": 0.5044,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.700497512437811,
1238
- "grad_norm": 0.2810980975627899,
1239
  "learning_rate": 1e-06,
1240
- "loss": 0.5336,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.7044776119402985,
1245
- "grad_norm": 0.2798118591308594,
1246
  "learning_rate": 1e-06,
1247
- "loss": 0.5623,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.708457711442786,
1252
- "grad_norm": 0.27893081307411194,
1253
  "learning_rate": 1e-06,
1254
- "loss": 0.5098,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7124378109452736,
1259
- "grad_norm": 0.2879588305950165,
1260
  "learning_rate": 1e-06,
1261
- "loss": 0.5581,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7164179104477612,
1266
- "grad_norm": 0.2735341787338257,
1267
  "learning_rate": 1e-06,
1268
- "loss": 0.4972,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.7203980099502487,
1273
- "grad_norm": 0.28305062651634216,
1274
  "learning_rate": 1e-06,
1275
- "loss": 0.5198,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7243781094527363,
1280
- "grad_norm": 0.2881869375705719,
1281
  "learning_rate": 1e-06,
1282
- "loss": 0.5236,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7283582089552239,
1287
- "grad_norm": 0.30144739151000977,
1288
  "learning_rate": 1e-06,
1289
- "loss": 0.5406,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7323383084577114,
1294
- "grad_norm": 0.28926968574523926,
1295
  "learning_rate": 1e-06,
1296
- "loss": 0.5571,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.736318407960199,
1301
- "grad_norm": 0.29733872413635254,
1302
  "learning_rate": 1e-06,
1303
- "loss": 0.6002,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7402985074626866,
1308
- "grad_norm": 0.28750744462013245,
1309
  "learning_rate": 1e-06,
1310
- "loss": 0.5629,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.7442786069651741,
1315
- "grad_norm": 0.25272336602211,
1316
  "learning_rate": 1e-06,
1317
- "loss": 0.5315,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.7482587064676617,
1322
- "grad_norm": 0.3123670220375061,
1323
  "learning_rate": 1e-06,
1324
- "loss": 0.5518,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.7522388059701492,
1329
- "grad_norm": 0.287804514169693,
1330
  "learning_rate": 1e-06,
1331
- "loss": 0.5308,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.7562189054726368,
1336
- "grad_norm": 0.27801209688186646,
1337
  "learning_rate": 1e-06,
1338
- "loss": 0.4952,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.7601990049751244,
1343
- "grad_norm": 0.29395267367362976,
1344
  "learning_rate": 1e-06,
1345
- "loss": 0.5072,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.764179104477612,
1350
- "grad_norm": 0.29356127977371216,
1351
  "learning_rate": 1e-06,
1352
- "loss": 0.5451,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.7681592039800995,
1357
- "grad_norm": 0.27663421630859375,
1358
  "learning_rate": 1e-06,
1359
- "loss": 0.5338,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.7721393034825871,
1364
- "grad_norm": 0.27448275685310364,
1365
  "learning_rate": 1e-06,
1366
- "loss": 0.5382,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.7761194029850746,
1371
- "grad_norm": 0.2774457037448883,
1372
  "learning_rate": 1e-06,
1373
- "loss": 0.4961,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.7800995024875622,
1378
- "grad_norm": 0.30790749192237854,
1379
  "learning_rate": 1e-06,
1380
- "loss": 0.5553,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.7840796019900498,
1385
- "grad_norm": 0.30943363904953003,
1386
  "learning_rate": 1e-06,
1387
- "loss": 0.5514,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.7880597014925373,
1392
- "grad_norm": 0.265715092420578,
1393
  "learning_rate": 1e-06,
1394
- "loss": 0.494,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.7920398009950249,
1399
- "grad_norm": 0.28460168838500977,
1400
  "learning_rate": 1e-06,
1401
- "loss": 0.5318,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.7960199004975125,
1406
- "grad_norm": 0.2925533354282379,
1407
  "learning_rate": 1e-06,
1408
- "loss": 0.5197,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8,
1413
- "grad_norm": 0.2781723141670227,
1414
  "learning_rate": 1e-06,
1415
- "loss": 0.4839,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8039800995024876,
1420
- "grad_norm": 0.28367018699645996,
1421
  "learning_rate": 1e-06,
1422
- "loss": 0.5533,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.8079601990049752,
1427
- "grad_norm": 0.2904638350009918,
1428
  "learning_rate": 1e-06,
1429
- "loss": 0.5128,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8119402985074626,
1434
- "grad_norm": 0.2869066596031189,
1435
  "learning_rate": 1e-06,
1436
- "loss": 0.5842,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8159203980099502,
1441
- "grad_norm": 0.2981327176094055,
1442
  "learning_rate": 1e-06,
1443
- "loss": 0.6124,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8199004975124378,
1448
- "grad_norm": 0.3040124177932739,
1449
  "learning_rate": 1e-06,
1450
- "loss": 0.5407,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8238805970149253,
1455
- "grad_norm": 0.283186674118042,
1456
  "learning_rate": 1e-06,
1457
- "loss": 0.5559,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8278606965174129,
1462
- "grad_norm": 0.29206421971321106,
1463
  "learning_rate": 1e-06,
1464
- "loss": 0.5143,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8318407960199005,
1469
- "grad_norm": 0.2698039412498474,
1470
  "learning_rate": 1e-06,
1471
- "loss": 0.5092,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.835820895522388,
1476
- "grad_norm": 0.3050399720668793,
1477
  "learning_rate": 1e-06,
1478
- "loss": 0.5436,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.8398009950248756,
1483
- "grad_norm": 0.2690124809741974,
1484
  "learning_rate": 1e-06,
1485
- "loss": 0.5118,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.8437810945273632,
1490
- "grad_norm": 0.2941598892211914,
1491
  "learning_rate": 1e-06,
1492
- "loss": 0.5776,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.8477611940298507,
1497
- "grad_norm": 0.267484188079834,
1498
  "learning_rate": 1e-06,
1499
- "loss": 0.481,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.8517412935323383,
1504
- "grad_norm": 0.3034264147281647,
1505
  "learning_rate": 1e-06,
1506
- "loss": 0.5479,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.8557213930348259,
1511
- "grad_norm": 0.29359570145606995,
1512
  "learning_rate": 1e-06,
1513
- "loss": 0.5369,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.8597014925373134,
1518
- "grad_norm": 0.2907046377658844,
1519
  "learning_rate": 1e-06,
1520
- "loss": 0.5127,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.863681592039801,
1525
- "grad_norm": 0.2787851095199585,
1526
  "learning_rate": 1e-06,
1527
- "loss": 0.5398,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.8676616915422886,
1532
- "grad_norm": 0.29438599944114685,
1533
  "learning_rate": 1e-06,
1534
- "loss": 0.5337,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.8716417910447761,
1539
- "grad_norm": 0.2769269645214081,
1540
  "learning_rate": 1e-06,
1541
- "loss": 0.5016,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.8756218905472637,
1546
- "grad_norm": 0.27982795238494873,
1547
  "learning_rate": 1e-06,
1548
- "loss": 0.5489,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.8796019900497513,
1553
- "grad_norm": 0.2620881497859955,
1554
  "learning_rate": 1e-06,
1555
- "loss": 0.4893,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.8835820895522388,
1560
- "grad_norm": 0.2869341969490051,
1561
  "learning_rate": 1e-06,
1562
- "loss": 0.5365,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.8875621890547264,
1567
- "grad_norm": 0.28541088104248047,
1568
  "learning_rate": 1e-06,
1569
- "loss": 0.5234,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.891542288557214,
1574
- "grad_norm": 0.2907220125198364,
1575
  "learning_rate": 1e-06,
1576
- "loss": 0.5224,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.8955223880597015,
1581
- "grad_norm": 0.3106067180633545,
1582
  "learning_rate": 1e-06,
1583
- "loss": 0.5616,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.8995024875621891,
1588
- "grad_norm": 0.2765253782272339,
1589
  "learning_rate": 1e-06,
1590
- "loss": 0.4978,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9034825870646767,
1595
- "grad_norm": 0.2780396342277527,
1596
  "learning_rate": 1e-06,
1597
- "loss": 0.5197,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9074626865671642,
1602
- "grad_norm": 0.2735743224620819,
1603
  "learning_rate": 1e-06,
1604
- "loss": 0.5081,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9114427860696518,
1609
- "grad_norm": 0.2986888289451599,
1610
  "learning_rate": 1e-06,
1611
- "loss": 0.504,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9154228855721394,
1616
- "grad_norm": 0.2711998522281647,
1617
  "learning_rate": 1e-06,
1618
- "loss": 0.5258,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9194029850746268,
1623
- "grad_norm": 0.27429237961769104,
1624
  "learning_rate": 1e-06,
1625
- "loss": 0.4983,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9233830845771144,
1630
- "grad_norm": 0.28108328580856323,
1631
  "learning_rate": 1e-06,
1632
- "loss": 0.5817,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 0.9273631840796019,
1637
- "grad_norm": 0.273513525724411,
1638
  "learning_rate": 1e-06,
1639
- "loss": 0.5024,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 0.9313432835820895,
1644
- "grad_norm": 0.2856132686138153,
1645
  "learning_rate": 1e-06,
1646
- "loss": 0.5257,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 0.9353233830845771,
1651
- "grad_norm": 0.2727264165878296,
1652
  "learning_rate": 1e-06,
1653
- "loss": 0.4796,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 0.9393034825870646,
1658
- "grad_norm": 0.2819795608520508,
1659
  "learning_rate": 1e-06,
1660
- "loss": 0.4993,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 0.9432835820895522,
1665
- "grad_norm": 0.29131144285202026,
1666
  "learning_rate": 1e-06,
1667
- "loss": 0.492,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 0.9472636815920398,
1672
- "grad_norm": 0.29098305106163025,
1673
  "learning_rate": 1e-06,
1674
- "loss": 0.5257,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 0.9512437810945273,
1679
- "grad_norm": 0.2734336853027344,
1680
  "learning_rate": 1e-06,
1681
- "loss": 0.487,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 0.9552238805970149,
1686
- "grad_norm": 0.26648443937301636,
1687
  "learning_rate": 1e-06,
1688
- "loss": 0.4864,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 0.9592039800995025,
1693
- "grad_norm": 0.2583979666233063,
1694
  "learning_rate": 1e-06,
1695
- "loss": 0.4622,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 0.96318407960199,
1700
- "grad_norm": 0.26614758372306824,
1701
  "learning_rate": 1e-06,
1702
- "loss": 0.5096,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 0.9671641791044776,
1707
- "grad_norm": 0.25741949677467346,
1708
  "learning_rate": 1e-06,
1709
- "loss": 0.4801,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 0.9711442786069652,
1714
- "grad_norm": 0.2788185477256775,
1715
  "learning_rate": 1e-06,
1716
- "loss": 0.4905,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 0.9751243781094527,
1721
- "grad_norm": 0.282296746969223,
1722
  "learning_rate": 1e-06,
1723
- "loss": 0.5223,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 0.9791044776119403,
1728
- "grad_norm": 0.2750173509120941,
1729
  "learning_rate": 1e-06,
1730
- "loss": 0.5051,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 0.9830845771144279,
1735
- "grad_norm": 0.2807095944881439,
1736
  "learning_rate": 1e-06,
1737
- "loss": 0.503,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 0.9870646766169154,
1742
- "grad_norm": 0.2665058970451355,
1743
  "learning_rate": 1e-06,
1744
- "loss": 0.4514,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 0.991044776119403,
1749
- "grad_norm": 0.26747071743011475,
1750
  "learning_rate": 1e-06,
1751
- "loss": 0.4601,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 0.9950248756218906,
1756
- "grad_norm": 0.2884337306022644,
1757
  "learning_rate": 1e-06,
1758
- "loss": 0.4899,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 0.9990049751243781,
1763
- "grad_norm": 0.29180482029914856,
1764
  "learning_rate": 1e-06,
1765
- "loss": 0.4998,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 0.9990049751243781,
1770
  "step": 251,
1771
- "total_flos": 5.849427398046515e+16,
1772
- "train_loss": 0.6000438257755036,
1773
- "train_runtime": 5544.176,
1774
- "train_samples_per_second": 2.175,
1775
- "train_steps_per_second": 0.045
1776
  }
1777
  ],
1778
  "logging_steps": 1,
@@ -1792,7 +1792,7 @@
1792
  "attributes": {}
1793
  }
1794
  },
1795
- "total_flos": 5.849427398046515e+16,
1796
  "train_batch_size": 1,
1797
  "trial_name": null,
1798
  "trial_params": null
 
10
  "log_history": [
11
  {
12
  "epoch": 0.003980099502487562,
13
+ "grad_norm": 8.602572441101074,
14
  "learning_rate": 1e-06,
15
+ "loss": 1.2197,
16
  "step": 1
17
  },
18
  {
19
  "epoch": 0.007960199004975124,
20
+ "grad_norm": 9.213591575622559,
21
  "learning_rate": 1e-06,
22
+ "loss": 1.132,
23
  "step": 2
24
  },
25
  {
26
  "epoch": 0.011940298507462687,
27
+ "grad_norm": 169.98960876464844,
28
  "learning_rate": 1e-06,
29
+ "loss": 1.118,
30
  "step": 3
31
  },
32
  {
33
  "epoch": 0.015920398009950248,
34
+ "grad_norm": 11.449417114257812,
35
  "learning_rate": 1e-06,
36
+ "loss": 1.2842,
37
  "step": 4
38
  },
39
  {
40
  "epoch": 0.01990049751243781,
41
+ "grad_norm": 16.207637786865234,
42
  "learning_rate": 1e-06,
43
+ "loss": 1.1521,
44
  "step": 5
45
  },
46
  {
47
  "epoch": 0.023880597014925373,
48
+ "grad_norm": 14.257845878601074,
49
  "learning_rate": 1e-06,
50
+ "loss": 1.2443,
51
  "step": 6
52
  },
53
  {
54
  "epoch": 0.027860696517412936,
55
+ "grad_norm": 10.684178352355957,
56
  "learning_rate": 1e-06,
57
+ "loss": 1.1476,
58
  "step": 7
59
  },
60
  {
61
  "epoch": 0.031840796019900496,
62
+ "grad_norm": 10.902571678161621,
63
  "learning_rate": 1e-06,
64
+ "loss": 1.1492,
65
  "step": 8
66
  },
67
  {
68
  "epoch": 0.03582089552238806,
69
+ "grad_norm": 31.9368839263916,
70
  "learning_rate": 1e-06,
71
+ "loss": 1.2449,
72
  "step": 9
73
  },
74
  {
75
  "epoch": 0.03980099502487562,
76
+ "grad_norm": 29.409448623657227,
77
  "learning_rate": 1e-06,
78
+ "loss": 0.9893,
79
  "step": 10
80
  },
81
  {
82
  "epoch": 0.04378109452736319,
83
+ "grad_norm": 35.553062438964844,
84
  "learning_rate": 1e-06,
85
+ "loss": 0.9807,
86
  "step": 11
87
  },
88
  {
89
  "epoch": 0.04776119402985075,
90
+ "grad_norm": 17.210281372070312,
91
  "learning_rate": 1e-06,
92
+ "loss": 0.9567,
93
  "step": 12
94
  },
95
  {
96
  "epoch": 0.051741293532338306,
97
+ "grad_norm": 18.666950225830078,
98
  "learning_rate": 1e-06,
99
+ "loss": 0.8209,
100
  "step": 13
101
  },
102
  {
103
  "epoch": 0.05572139303482587,
104
+ "grad_norm": 11.282356262207031,
105
  "learning_rate": 1e-06,
106
+ "loss": 0.9215,
107
  "step": 14
108
  },
109
  {
110
  "epoch": 0.05970149253731343,
111
+ "grad_norm": 36.11897659301758,
112
  "learning_rate": 1e-06,
113
+ "loss": 0.8758,
114
  "step": 15
115
  },
116
  {
117
  "epoch": 0.06368159203980099,
118
+ "grad_norm": 8.626749038696289,
119
  "learning_rate": 1e-06,
120
+ "loss": 0.8582,
121
  "step": 16
122
  },
123
  {
124
  "epoch": 0.06766169154228856,
125
+ "grad_norm": 11.078080177307129,
126
  "learning_rate": 1e-06,
127
+ "loss": 0.8324,
128
  "step": 17
129
  },
130
  {
131
  "epoch": 0.07164179104477612,
132
+ "grad_norm": 8.919255256652832,
133
  "learning_rate": 1e-06,
134
+ "loss": 0.832,
135
  "step": 18
136
  },
137
  {
138
  "epoch": 0.07562189054726368,
139
+ "grad_norm": 2.5705490112304688,
140
  "learning_rate": 1e-06,
141
+ "loss": 0.7914,
142
  "step": 19
143
  },
144
  {
145
  "epoch": 0.07960199004975124,
146
+ "grad_norm": 3.116955280303955,
147
  "learning_rate": 1e-06,
148
+ "loss": 0.7534,
149
  "step": 20
150
  },
151
  {
152
  "epoch": 0.08358208955223881,
153
+ "grad_norm": 3.8580944538116455,
154
  "learning_rate": 1e-06,
155
+ "loss": 0.7618,
156
  "step": 21
157
  },
158
  {
159
  "epoch": 0.08756218905472637,
160
+ "grad_norm": 3.251260995864868,
161
  "learning_rate": 1e-06,
162
+ "loss": 0.7007,
163
  "step": 22
164
  },
165
  {
166
  "epoch": 0.09154228855721393,
167
+ "grad_norm": 3.630941152572632,
168
  "learning_rate": 1e-06,
169
+ "loss": 0.6953,
170
  "step": 23
171
  },
172
  {
173
  "epoch": 0.0955223880597015,
174
+ "grad_norm": 3.7083191871643066,
175
  "learning_rate": 1e-06,
176
+ "loss": 0.6851,
177
  "step": 24
178
  },
179
  {
180
  "epoch": 0.09950248756218906,
181
+ "grad_norm": 3.5167739391326904,
182
  "learning_rate": 1e-06,
183
+ "loss": 0.6843,
184
  "step": 25
185
  },
186
  {
187
  "epoch": 0.10348258706467661,
188
+ "grad_norm": 2.4625730514526367,
189
  "learning_rate": 1e-06,
190
+ "loss": 0.6154,
191
  "step": 26
192
  },
193
  {
194
  "epoch": 0.10746268656716418,
195
+ "grad_norm": 2.900531768798828,
196
  "learning_rate": 1e-06,
197
+ "loss": 0.6588,
198
  "step": 27
199
  },
200
  {
201
  "epoch": 0.11144278606965174,
202
+ "grad_norm": 2.720223903656006,
203
  "learning_rate": 1e-06,
204
+ "loss": 0.5933,
205
  "step": 28
206
  },
207
  {
208
  "epoch": 0.1154228855721393,
209
+ "grad_norm": 2.8841376304626465,
210
  "learning_rate": 1e-06,
211
+ "loss": 0.5398,
212
  "step": 29
213
  },
214
  {
215
  "epoch": 0.11940298507462686,
216
+ "grad_norm": 2.3240113258361816,
217
  "learning_rate": 1e-06,
218
+ "loss": 0.617,
219
  "step": 30
220
  },
221
  {
222
  "epoch": 0.12338308457711443,
223
+ "grad_norm": 1.750429630279541,
224
  "learning_rate": 1e-06,
225
+ "loss": 0.6119,
226
  "step": 31
227
  },
228
  {
229
  "epoch": 0.12736318407960198,
230
+ "grad_norm": 0.7355216145515442,
231
  "learning_rate": 1e-06,
232
+ "loss": 0.512,
233
  "step": 32
234
  },
235
  {
236
  "epoch": 0.13134328358208955,
237
+ "grad_norm": 1.8516645431518555,
238
  "learning_rate": 1e-06,
239
+ "loss": 0.5669,
240
  "step": 33
241
  },
242
  {
243
  "epoch": 0.13532338308457711,
244
+ "grad_norm": 1.235843539237976,
245
  "learning_rate": 1e-06,
246
+ "loss": 0.5609,
247
  "step": 34
248
  },
249
  {
250
  "epoch": 0.13930348258706468,
251
+ "grad_norm": 2.795116424560547,
252
  "learning_rate": 1e-06,
253
+ "loss": 0.5855,
254
  "step": 35
255
  },
256
  {
257
  "epoch": 0.14328358208955225,
258
+ "grad_norm": 1.3828765153884888,
259
  "learning_rate": 1e-06,
260
+ "loss": 0.5657,
261
  "step": 36
262
  },
263
  {
264
  "epoch": 0.1472636815920398,
265
+ "grad_norm": 0.7134882807731628,
266
  "learning_rate": 1e-06,
267
+ "loss": 0.5639,
268
  "step": 37
269
  },
270
  {
271
  "epoch": 0.15124378109452735,
272
+ "grad_norm": 0.5558878183364868,
273
  "learning_rate": 1e-06,
274
+ "loss": 0.5444,
275
  "step": 38
276
  },
277
  {
278
  "epoch": 0.15522388059701492,
279
+ "grad_norm": 0.6173982620239258,
280
  "learning_rate": 1e-06,
281
+ "loss": 0.54,
282
  "step": 39
283
  },
284
  {
285
  "epoch": 0.15920398009950248,
286
+ "grad_norm": 0.49912330508232117,
287
  "learning_rate": 1e-06,
288
+ "loss": 0.5523,
289
  "step": 40
290
  },
291
  {
292
  "epoch": 0.16318407960199005,
293
+ "grad_norm": 0.4737468361854553,
294
  "learning_rate": 1e-06,
295
+ "loss": 0.5371,
296
  "step": 41
297
  },
298
  {
299
  "epoch": 0.16716417910447762,
300
+ "grad_norm": 0.4051644504070282,
301
  "learning_rate": 1e-06,
302
+ "loss": 0.5309,
303
  "step": 42
304
  },
305
  {
306
  "epoch": 0.17114427860696518,
307
+ "grad_norm": 0.42458516359329224,
308
  "learning_rate": 1e-06,
309
+ "loss": 0.5355,
310
  "step": 43
311
  },
312
  {
313
  "epoch": 0.17512437810945275,
314
+ "grad_norm": 0.3913373351097107,
315
  "learning_rate": 1e-06,
316
+ "loss": 0.5142,
317
  "step": 44
318
  },
319
  {
320
  "epoch": 0.1791044776119403,
321
+ "grad_norm": 0.3934544324874878,
322
  "learning_rate": 1e-06,
323
+ "loss": 0.5441,
324
  "step": 45
325
  },
326
  {
327
  "epoch": 0.18308457711442785,
328
+ "grad_norm": 0.4370742440223694,
329
  "learning_rate": 1e-06,
330
+ "loss": 0.5367,
331
  "step": 46
332
  },
333
  {
334
  "epoch": 0.18706467661691542,
335
+ "grad_norm": 0.4685991704463959,
336
  "learning_rate": 1e-06,
337
+ "loss": 0.5377,
338
  "step": 47
339
  },
340
  {
341
  "epoch": 0.191044776119403,
342
+ "grad_norm": 0.44145771861076355,
343
  "learning_rate": 1e-06,
344
+ "loss": 0.5221,
345
  "step": 48
346
  },
347
  {
348
  "epoch": 0.19502487562189055,
349
+ "grad_norm": 0.4267702102661133,
350
  "learning_rate": 1e-06,
351
+ "loss": 0.5303,
352
  "step": 49
353
  },
354
  {
355
  "epoch": 0.19900497512437812,
356
+ "grad_norm": 0.369733065366745,
357
  "learning_rate": 1e-06,
358
+ "loss": 0.4982,
359
  "step": 50
360
  },
361
  {
362
  "epoch": 0.20298507462686566,
363
+ "grad_norm": 0.3949680030345917,
364
  "learning_rate": 1e-06,
365
+ "loss": 0.5177,
366
  "step": 51
367
  },
368
  {
369
  "epoch": 0.20696517412935322,
370
+ "grad_norm": 0.4119485914707184,
371
  "learning_rate": 1e-06,
372
+ "loss": 0.5207,
373
  "step": 52
374
  },
375
  {
376
  "epoch": 0.2109452736318408,
377
+ "grad_norm": 0.3837582767009735,
378
  "learning_rate": 1e-06,
379
+ "loss": 0.5275,
380
  "step": 53
381
  },
382
  {
383
  "epoch": 0.21492537313432836,
384
+ "grad_norm": 0.36209946870803833,
385
  "learning_rate": 1e-06,
386
+ "loss": 0.4727,
387
  "step": 54
388
  },
389
  {
390
  "epoch": 0.21890547263681592,
391
+ "grad_norm": 0.38610896468162537,
392
  "learning_rate": 1e-06,
393
+ "loss": 0.4986,
394
  "step": 55
395
  },
396
  {
397
  "epoch": 0.2228855721393035,
398
+ "grad_norm": 0.45478999614715576,
399
  "learning_rate": 1e-06,
400
+ "loss": 0.5893,
401
  "step": 56
402
  },
403
  {
404
  "epoch": 0.22686567164179106,
405
+ "grad_norm": 0.4074576795101166,
406
  "learning_rate": 1e-06,
407
+ "loss": 0.5324,
408
  "step": 57
409
  },
410
  {
411
  "epoch": 0.2308457711442786,
412
+ "grad_norm": 0.5080429315567017,
413
  "learning_rate": 1e-06,
414
+ "loss": 0.5774,
415
  "step": 58
416
  },
417
  {
418
  "epoch": 0.23482587064676616,
419
+ "grad_norm": 0.40697044134140015,
420
  "learning_rate": 1e-06,
421
+ "loss": 0.5357,
422
  "step": 59
423
  },
424
  {
425
  "epoch": 0.23880597014925373,
426
+ "grad_norm": 0.40931448340415955,
427
  "learning_rate": 1e-06,
428
+ "loss": 0.5142,
429
  "step": 60
430
  },
431
  {
432
  "epoch": 0.2427860696517413,
433
+ "grad_norm": 0.38639310002326965,
434
  "learning_rate": 1e-06,
435
+ "loss": 0.4934,
436
  "step": 61
437
  },
438
  {
439
  "epoch": 0.24676616915422886,
440
+ "grad_norm": 0.3946526050567627,
441
  "learning_rate": 1e-06,
442
+ "loss": 0.5117,
443
  "step": 62
444
  },
445
  {
446
  "epoch": 0.2507462686567164,
447
+ "grad_norm": 0.4127659797668457,
448
  "learning_rate": 1e-06,
449
+ "loss": 0.5322,
450
  "step": 63
451
  },
452
  {
453
  "epoch": 0.25472636815920396,
454
+ "grad_norm": 0.39072689414024353,
455
  "learning_rate": 1e-06,
456
+ "loss": 0.5116,
457
  "step": 64
458
  },
459
  {
460
  "epoch": 0.25870646766169153,
461
+ "grad_norm": 0.4337158501148224,
462
  "learning_rate": 1e-06,
463
+ "loss": 0.4974,
464
  "step": 65
465
  },
466
  {
467
  "epoch": 0.2626865671641791,
468
+ "grad_norm": 0.4008353352546692,
469
  "learning_rate": 1e-06,
470
+ "loss": 0.5221,
471
  "step": 66
472
  },
473
  {
474
  "epoch": 0.26666666666666666,
475
+ "grad_norm": 0.3955201208591461,
476
  "learning_rate": 1e-06,
477
+ "loss": 0.5538,
478
  "step": 67
479
  },
480
  {
481
  "epoch": 0.27064676616915423,
482
+ "grad_norm": 0.3759704530239105,
483
  "learning_rate": 1e-06,
484
+ "loss": 0.5423,
485
  "step": 68
486
  },
487
  {
488
  "epoch": 0.2746268656716418,
489
+ "grad_norm": 0.4024805426597595,
490
  "learning_rate": 1e-06,
491
+ "loss": 0.542,
492
  "step": 69
493
  },
494
  {
495
  "epoch": 0.27860696517412936,
496
+ "grad_norm": 0.4321470856666565,
497
  "learning_rate": 1e-06,
498
+ "loss": 0.544,
499
  "step": 70
500
  },
501
  {
502
  "epoch": 0.28258706467661693,
503
+ "grad_norm": 0.41789472103118896,
504
  "learning_rate": 1e-06,
505
+ "loss": 0.521,
506
  "step": 71
507
  },
508
  {
509
  "epoch": 0.2865671641791045,
510
+ "grad_norm": 0.40374019742012024,
511
  "learning_rate": 1e-06,
512
+ "loss": 0.5361,
513
  "step": 72
514
  },
515
  {
516
  "epoch": 0.29054726368159206,
517
+ "grad_norm": 0.3968409299850464,
518
  "learning_rate": 1e-06,
519
+ "loss": 0.5189,
520
  "step": 73
521
  },
522
  {
523
  "epoch": 0.2945273631840796,
524
+ "grad_norm": 0.41135865449905396,
525
  "learning_rate": 1e-06,
526
+ "loss": 0.521,
527
  "step": 74
528
  },
529
  {
530
  "epoch": 0.29850746268656714,
531
+ "grad_norm": 0.37400493025779724,
532
  "learning_rate": 1e-06,
533
+ "loss": 0.5249,
534
  "step": 75
535
  },
536
  {
537
  "epoch": 0.3024875621890547,
538
+ "grad_norm": 0.39351746439933777,
539
  "learning_rate": 1e-06,
540
+ "loss": 0.5013,
541
  "step": 76
542
  },
543
  {
544
  "epoch": 0.30646766169154227,
545
+ "grad_norm": 0.409321665763855,
546
  "learning_rate": 1e-06,
547
+ "loss": 0.4912,
548
  "step": 77
549
  },
550
  {
551
  "epoch": 0.31044776119402984,
552
+ "grad_norm": 0.38681185245513916,
553
  "learning_rate": 1e-06,
554
+ "loss": 0.5099,
555
  "step": 78
556
  },
557
  {
558
  "epoch": 0.3144278606965174,
559
+ "grad_norm": 0.37752920389175415,
560
  "learning_rate": 1e-06,
561
+ "loss": 0.4987,
562
  "step": 79
563
  },
564
  {
565
  "epoch": 0.31840796019900497,
566
+ "grad_norm": 0.41034936904907227,
567
  "learning_rate": 1e-06,
568
+ "loss": 0.4803,
569
  "step": 80
570
  },
571
  {
572
  "epoch": 0.32238805970149254,
573
+ "grad_norm": 0.39453473687171936,
574
  "learning_rate": 1e-06,
575
+ "loss": 0.4932,
576
  "step": 81
577
  },
578
  {
579
  "epoch": 0.3263681592039801,
580
+ "grad_norm": 0.3872039020061493,
581
  "learning_rate": 1e-06,
582
+ "loss": 0.5196,
583
  "step": 82
584
  },
585
  {
586
  "epoch": 0.33034825870646767,
587
+ "grad_norm": 0.4377211332321167,
588
  "learning_rate": 1e-06,
589
+ "loss": 0.5017,
590
  "step": 83
591
  },
592
  {
593
  "epoch": 0.33432835820895523,
594
+ "grad_norm": 0.4183085560798645,
595
  "learning_rate": 1e-06,
596
+ "loss": 0.5048,
597
  "step": 84
598
  },
599
  {
600
  "epoch": 0.3383084577114428,
601
+ "grad_norm": 0.4024551510810852,
602
  "learning_rate": 1e-06,
603
+ "loss": 0.5096,
604
  "step": 85
605
  },
606
  {
607
  "epoch": 0.34228855721393037,
608
+ "grad_norm": 0.39065393805503845,
609
  "learning_rate": 1e-06,
610
+ "loss": 0.4971,
611
  "step": 86
612
  },
613
  {
614
  "epoch": 0.34626865671641793,
615
+ "grad_norm": 0.3872017562389374,
616
  "learning_rate": 1e-06,
617
+ "loss": 0.4991,
618
  "step": 87
619
  },
620
  {
621
  "epoch": 0.3502487562189055,
622
+ "grad_norm": 0.38857075572013855,
623
  "learning_rate": 1e-06,
624
+ "loss": 0.5274,
625
  "step": 88
626
  },
627
  {
628
  "epoch": 0.354228855721393,
629
+ "grad_norm": 0.3992158770561218,
630
  "learning_rate": 1e-06,
631
+ "loss": 0.5069,
632
  "step": 89
633
  },
634
  {
635
  "epoch": 0.3582089552238806,
636
+ "grad_norm": 0.4116052985191345,
637
  "learning_rate": 1e-06,
638
+ "loss": 0.5048,
639
  "step": 90
640
  },
641
  {
642
  "epoch": 0.36218905472636814,
643
+ "grad_norm": 0.3963039815425873,
644
  "learning_rate": 1e-06,
645
+ "loss": 0.488,
646
  "step": 91
647
  },
648
  {
649
  "epoch": 0.3661691542288557,
650
+ "grad_norm": 0.3977671265602112,
651
  "learning_rate": 1e-06,
652
+ "loss": 0.5362,
653
  "step": 92
654
  },
655
  {
656
  "epoch": 0.3701492537313433,
657
+ "grad_norm": 0.40321069955825806,
658
  "learning_rate": 1e-06,
659
+ "loss": 0.5116,
660
  "step": 93
661
  },
662
  {
663
  "epoch": 0.37412935323383084,
664
+ "grad_norm": 0.4177272915840149,
665
  "learning_rate": 1e-06,
666
+ "loss": 0.524,
667
  "step": 94
668
  },
669
  {
670
  "epoch": 0.3781094527363184,
671
+ "grad_norm": 0.4061485230922699,
672
  "learning_rate": 1e-06,
673
+ "loss": 0.5228,
674
  "step": 95
675
  },
676
  {
677
  "epoch": 0.382089552238806,
678
+ "grad_norm": 0.39875149726867676,
679
  "learning_rate": 1e-06,
680
+ "loss": 0.4782,
681
  "step": 96
682
  },
683
  {
684
  "epoch": 0.38606965174129354,
685
+ "grad_norm": 0.4054339528083801,
686
  "learning_rate": 1e-06,
687
+ "loss": 0.4998,
688
  "step": 97
689
  },
690
  {
691
  "epoch": 0.3900497512437811,
692
+ "grad_norm": 0.3824702501296997,
693
  "learning_rate": 1e-06,
694
+ "loss": 0.4701,
695
  "step": 98
696
  },
697
  {
698
  "epoch": 0.3940298507462687,
699
+ "grad_norm": 0.4319639801979065,
700
  "learning_rate": 1e-06,
701
+ "loss": 0.5651,
702
  "step": 99
703
  },
704
  {
705
  "epoch": 0.39800995024875624,
706
+ "grad_norm": 0.39380550384521484,
707
  "learning_rate": 1e-06,
708
+ "loss": 0.4958,
709
  "step": 100
710
  },
711
  {
712
  "epoch": 0.4019900497512438,
713
+ "grad_norm": 0.38747814297676086,
714
  "learning_rate": 1e-06,
715
+ "loss": 0.5067,
716
  "step": 101
717
  },
718
  {
719
  "epoch": 0.4059701492537313,
720
+ "grad_norm": 0.41260620951652527,
721
  "learning_rate": 1e-06,
722
+ "loss": 0.5148,
723
  "step": 102
724
  },
725
  {
726
  "epoch": 0.4099502487562189,
727
+ "grad_norm": 0.3776450455188751,
728
  "learning_rate": 1e-06,
729
+ "loss": 0.5009,
730
  "step": 103
731
  },
732
  {
733
  "epoch": 0.41393034825870645,
734
+ "grad_norm": 0.4281792938709259,
735
  "learning_rate": 1e-06,
736
+ "loss": 0.5878,
737
  "step": 104
738
  },
739
  {
740
  "epoch": 0.417910447761194,
741
+ "grad_norm": 0.4015783965587616,
742
  "learning_rate": 1e-06,
743
+ "loss": 0.5137,
744
  "step": 105
745
  },
746
  {
747
  "epoch": 0.4218905472636816,
748
+ "grad_norm": 0.38817304372787476,
749
  "learning_rate": 1e-06,
750
+ "loss": 0.4847,
751
  "step": 106
752
  },
753
  {
754
  "epoch": 0.42587064676616915,
755
+ "grad_norm": 0.4299408495426178,
756
  "learning_rate": 1e-06,
757
+ "loss": 0.4906,
758
  "step": 107
759
  },
760
  {
761
  "epoch": 0.4298507462686567,
762
+ "grad_norm": 0.3869856595993042,
763
  "learning_rate": 1e-06,
764
+ "loss": 0.5093,
765
  "step": 108
766
  },
767
  {
768
  "epoch": 0.4338308457711443,
769
+ "grad_norm": 0.38563865423202515,
770
  "learning_rate": 1e-06,
771
+ "loss": 0.4807,
772
  "step": 109
773
  },
774
  {
775
  "epoch": 0.43781094527363185,
776
+ "grad_norm": 0.39928150177001953,
777
  "learning_rate": 1e-06,
778
+ "loss": 0.4896,
779
  "step": 110
780
  },
781
  {
782
  "epoch": 0.4417910447761194,
783
+ "grad_norm": 0.39707064628601074,
784
  "learning_rate": 1e-06,
785
+ "loss": 0.5104,
786
  "step": 111
787
  },
788
  {
789
  "epoch": 0.445771144278607,
790
+ "grad_norm": 0.4231569468975067,
791
  "learning_rate": 1e-06,
792
+ "loss": 0.4862,
793
  "step": 112
794
  },
795
  {
796
  "epoch": 0.44975124378109455,
797
+ "grad_norm": 0.4070363938808441,
798
  "learning_rate": 1e-06,
799
+ "loss": 0.4649,
800
  "step": 113
801
  },
802
  {
803
  "epoch": 0.4537313432835821,
804
+ "grad_norm": 0.39055392146110535,
805
  "learning_rate": 1e-06,
806
+ "loss": 0.4679,
807
  "step": 114
808
  },
809
  {
810
  "epoch": 0.4577114427860697,
811
+ "grad_norm": 0.3945823013782501,
812
  "learning_rate": 1e-06,
813
+ "loss": 0.4686,
814
  "step": 115
815
  },
816
  {
817
  "epoch": 0.4616915422885572,
818
+ "grad_norm": 0.37572288513183594,
819
  "learning_rate": 1e-06,
820
+ "loss": 0.4594,
821
  "step": 116
822
  },
823
  {
824
  "epoch": 0.46567164179104475,
825
+ "grad_norm": 0.368758887052536,
826
  "learning_rate": 1e-06,
827
+ "loss": 0.4539,
828
  "step": 117
829
  },
830
  {
831
  "epoch": 0.4696517412935323,
832
+ "grad_norm": 0.37711796164512634,
833
  "learning_rate": 1e-06,
834
+ "loss": 0.4532,
835
  "step": 118
836
  },
837
  {
838
  "epoch": 0.4736318407960199,
839
+ "grad_norm": 0.43404269218444824,
840
  "learning_rate": 1e-06,
841
+ "loss": 0.544,
842
  "step": 119
843
  },
844
  {
845
  "epoch": 0.47761194029850745,
846
+ "grad_norm": 0.3962051272392273,
847
  "learning_rate": 1e-06,
848
+ "loss": 0.471,
849
  "step": 120
850
  },
851
  {
852
  "epoch": 0.481592039800995,
853
+ "grad_norm": 0.3800894618034363,
854
  "learning_rate": 1e-06,
855
+ "loss": 0.4606,
856
  "step": 121
857
  },
858
  {
859
  "epoch": 0.4855721393034826,
860
+ "grad_norm": 0.41219913959503174,
861
  "learning_rate": 1e-06,
862
+ "loss": 0.4979,
863
  "step": 122
864
  },
865
  {
866
  "epoch": 0.48955223880597015,
867
+ "grad_norm": 0.39176592230796814,
868
  "learning_rate": 1e-06,
869
+ "loss": 0.4987,
870
  "step": 123
871
  },
872
  {
873
  "epoch": 0.4935323383084577,
874
+ "grad_norm": 0.39850085973739624,
875
  "learning_rate": 1e-06,
876
+ "loss": 0.5003,
877
  "step": 124
878
  },
879
  {
880
  "epoch": 0.4975124378109453,
881
+ "grad_norm": 0.38991016149520874,
882
  "learning_rate": 1e-06,
883
+ "loss": 0.4912,
884
  "step": 125
885
  },
886
  {
887
  "epoch": 0.5014925373134328,
888
+ "grad_norm": 0.3829534351825714,
889
  "learning_rate": 1e-06,
890
+ "loss": 0.4662,
891
  "step": 126
892
  },
893
  {
894
  "epoch": 0.5054726368159204,
895
+ "grad_norm": 0.36051082611083984,
896
  "learning_rate": 1e-06,
897
+ "loss": 0.4173,
898
  "step": 127
899
  },
900
  {
901
  "epoch": 0.5094527363184079,
902
+ "grad_norm": 0.39890432357788086,
903
  "learning_rate": 1e-06,
904
+ "loss": 0.5032,
905
  "step": 128
906
  },
907
  {
908
  "epoch": 0.5134328358208955,
909
+ "grad_norm": 0.3870425224304199,
910
  "learning_rate": 1e-06,
911
+ "loss": 0.4465,
912
  "step": 129
913
  },
914
  {
915
  "epoch": 0.5174129353233831,
916
+ "grad_norm": 0.37743470072746277,
917
  "learning_rate": 1e-06,
918
+ "loss": 0.4665,
919
  "step": 130
920
  },
921
  {
922
  "epoch": 0.5213930348258706,
923
+ "grad_norm": 0.389920175075531,
924
  "learning_rate": 1e-06,
925
+ "loss": 0.4723,
926
  "step": 131
927
  },
928
  {
929
  "epoch": 0.5253731343283582,
930
+ "grad_norm": 0.3944704830646515,
931
  "learning_rate": 1e-06,
932
+ "loss": 0.4666,
933
  "step": 132
934
  },
935
  {
936
  "epoch": 0.5293532338308458,
937
+ "grad_norm": 0.3912000358104706,
938
  "learning_rate": 1e-06,
939
+ "loss": 0.4649,
940
  "step": 133
941
  },
942
  {
943
  "epoch": 0.5333333333333333,
944
+ "grad_norm": 0.4006643295288086,
945
  "learning_rate": 1e-06,
946
+ "loss": 0.4928,
947
  "step": 134
948
  },
949
  {
950
  "epoch": 0.5373134328358209,
951
+ "grad_norm": 0.3699991703033447,
952
  "learning_rate": 1e-06,
953
+ "loss": 0.4092,
954
  "step": 135
955
  },
956
  {
957
  "epoch": 0.5412935323383085,
958
+ "grad_norm": 0.38336026668548584,
959
  "learning_rate": 1e-06,
960
+ "loss": 0.4419,
961
  "step": 136
962
  },
963
  {
964
  "epoch": 0.545273631840796,
965
+ "grad_norm": 0.35077640414237976,
966
  "learning_rate": 1e-06,
967
+ "loss": 0.4206,
968
  "step": 137
969
  },
970
  {
971
  "epoch": 0.5492537313432836,
972
+ "grad_norm": 0.3833373188972473,
973
  "learning_rate": 1e-06,
974
+ "loss": 0.45,
975
  "step": 138
976
  },
977
  {
978
  "epoch": 0.5532338308457712,
979
+ "grad_norm": 0.38633936643600464,
980
  "learning_rate": 1e-06,
981
+ "loss": 0.4289,
982
  "step": 139
983
  },
984
  {
985
  "epoch": 0.5572139303482587,
986
+ "grad_norm": 0.36919641494750977,
987
  "learning_rate": 1e-06,
988
+ "loss": 0.4515,
989
  "step": 140
990
  },
991
  {
992
  "epoch": 0.5611940298507463,
993
+ "grad_norm": 0.398011714220047,
994
  "learning_rate": 1e-06,
995
+ "loss": 0.4959,
996
  "step": 141
997
  },
998
  {
999
  "epoch": 0.5651741293532339,
1000
+ "grad_norm": 0.38399818539619446,
1001
  "learning_rate": 1e-06,
1002
+ "loss": 0.4676,
1003
  "step": 142
1004
  },
1005
  {
1006
  "epoch": 0.5691542288557214,
1007
+ "grad_norm": 0.389676958322525,
1008
  "learning_rate": 1e-06,
1009
+ "loss": 0.4481,
1010
  "step": 143
1011
  },
1012
  {
1013
  "epoch": 0.573134328358209,
1014
+ "grad_norm": 0.4080444574356079,
1015
  "learning_rate": 1e-06,
1016
+ "loss": 0.508,
1017
  "step": 144
1018
  },
1019
  {
1020
  "epoch": 0.5771144278606966,
1021
+ "grad_norm": 0.36857450008392334,
1022
  "learning_rate": 1e-06,
1023
+ "loss": 0.4124,
1024
  "step": 145
1025
  },
1026
  {
1027
  "epoch": 0.5810945273631841,
1028
+ "grad_norm": 0.39227306842803955,
1029
  "learning_rate": 1e-06,
1030
+ "loss": 0.4373,
1031
  "step": 146
1032
  },
1033
  {
1034
  "epoch": 0.5850746268656717,
1035
+ "grad_norm": 0.35137900710105896,
1036
  "learning_rate": 1e-06,
1037
+ "loss": 0.3968,
1038
  "step": 147
1039
  },
1040
  {
1041
  "epoch": 0.5890547263681593,
1042
+ "grad_norm": 0.3839924931526184,
1043
  "learning_rate": 1e-06,
1044
+ "loss": 0.4501,
1045
  "step": 148
1046
  },
1047
  {
1048
  "epoch": 0.5930348258706468,
1049
+ "grad_norm": 0.40611201524734497,
1050
  "learning_rate": 1e-06,
1051
+ "loss": 0.4623,
1052
  "step": 149
1053
  },
1054
  {
1055
  "epoch": 0.5970149253731343,
1056
+ "grad_norm": 0.40611159801483154,
1057
  "learning_rate": 1e-06,
1058
+ "loss": 0.4752,
1059
  "step": 150
1060
  },
1061
  {
1062
  "epoch": 0.6009950248756218,
1063
+ "grad_norm": 0.3694476783275604,
1064
  "learning_rate": 1e-06,
1065
+ "loss": 0.4645,
1066
  "step": 151
1067
  },
1068
  {
1069
  "epoch": 0.6049751243781094,
1070
+ "grad_norm": 0.3776058554649353,
1071
  "learning_rate": 1e-06,
1072
+ "loss": 0.4398,
1073
  "step": 152
1074
  },
1075
  {
1076
  "epoch": 0.608955223880597,
1077
+ "grad_norm": 0.3951246440410614,
1078
  "learning_rate": 1e-06,
1079
+ "loss": 0.4395,
1080
  "step": 153
1081
  },
1082
  {
1083
  "epoch": 0.6129353233830845,
1084
+ "grad_norm": 0.4008040726184845,
1085
  "learning_rate": 1e-06,
1086
+ "loss": 0.4805,
1087
  "step": 154
1088
  },
1089
  {
1090
  "epoch": 0.6169154228855721,
1091
+ "grad_norm": 0.39640602469444275,
1092
  "learning_rate": 1e-06,
1093
+ "loss": 0.51,
1094
  "step": 155
1095
  },
1096
  {
1097
  "epoch": 0.6208955223880597,
1098
+ "grad_norm": 0.392069548368454,
1099
  "learning_rate": 1e-06,
1100
+ "loss": 0.464,
1101
  "step": 156
1102
  },
1103
  {
1104
  "epoch": 0.6248756218905472,
1105
+ "grad_norm": 0.3650537133216858,
1106
  "learning_rate": 1e-06,
1107
+ "loss": 0.4243,
1108
  "step": 157
1109
  },
1110
  {
1111
  "epoch": 0.6288557213930348,
1112
+ "grad_norm": 0.38185617327690125,
1113
  "learning_rate": 1e-06,
1114
+ "loss": 0.4174,
1115
  "step": 158
1116
  },
1117
  {
1118
  "epoch": 0.6328358208955224,
1119
+ "grad_norm": 0.3775487244129181,
1120
  "learning_rate": 1e-06,
1121
+ "loss": 0.4471,
1122
  "step": 159
1123
  },
1124
  {
1125
  "epoch": 0.6368159203980099,
1126
+ "grad_norm": 0.3903777003288269,
1127
  "learning_rate": 1e-06,
1128
+ "loss": 0.4595,
1129
  "step": 160
1130
  },
1131
  {
1132
  "epoch": 0.6407960199004975,
1133
+ "grad_norm": 0.3664606213569641,
1134
  "learning_rate": 1e-06,
1135
+ "loss": 0.4309,
1136
  "step": 161
1137
  },
1138
  {
1139
  "epoch": 0.6447761194029851,
1140
+ "grad_norm": 0.3931891918182373,
1141
  "learning_rate": 1e-06,
1142
+ "loss": 0.4724,
1143
  "step": 162
1144
  },
1145
  {
1146
  "epoch": 0.6487562189054726,
1147
+ "grad_norm": 0.3914732038974762,
1148
  "learning_rate": 1e-06,
1149
+ "loss": 0.4491,
1150
  "step": 163
1151
  },
1152
  {
1153
  "epoch": 0.6527363184079602,
1154
+ "grad_norm": 0.4103309214115143,
1155
  "learning_rate": 1e-06,
1156
+ "loss": 0.4591,
1157
  "step": 164
1158
  },
1159
  {
1160
  "epoch": 0.6567164179104478,
1161
+ "grad_norm": 0.402506560087204,
1162
  "learning_rate": 1e-06,
1163
+ "loss": 0.463,
1164
  "step": 165
1165
  },
1166
  {
1167
  "epoch": 0.6606965174129353,
1168
+ "grad_norm": 0.36114874482154846,
1169
  "learning_rate": 1e-06,
1170
+ "loss": 0.4156,
1171
  "step": 166
1172
  },
1173
  {
1174
  "epoch": 0.6646766169154229,
1175
+ "grad_norm": 0.3747994899749756,
1176
  "learning_rate": 1e-06,
1177
+ "loss": 0.3993,
1178
  "step": 167
1179
  },
1180
  {
1181
  "epoch": 0.6686567164179105,
1182
+ "grad_norm": 0.3962922692298889,
1183
  "learning_rate": 1e-06,
1184
+ "loss": 0.4604,
1185
  "step": 168
1186
  },
1187
  {
1188
  "epoch": 0.672636815920398,
1189
+ "grad_norm": 0.3930504322052002,
1190
  "learning_rate": 1e-06,
1191
+ "loss": 0.4467,
1192
  "step": 169
1193
  },
1194
  {
1195
  "epoch": 0.6766169154228856,
1196
+ "grad_norm": 0.3787241280078888,
1197
  "learning_rate": 1e-06,
1198
+ "loss": 0.4205,
1199
  "step": 170
1200
  },
1201
  {
1202
  "epoch": 0.6805970149253732,
1203
+ "grad_norm": 0.3823286294937134,
1204
  "learning_rate": 1e-06,
1205
+ "loss": 0.4289,
1206
  "step": 171
1207
  },
1208
  {
1209
  "epoch": 0.6845771144278607,
1210
+ "grad_norm": 0.3922288715839386,
1211
  "learning_rate": 1e-06,
1212
+ "loss": 0.4603,
1213
  "step": 172
1214
  },
1215
  {
1216
  "epoch": 0.6885572139303483,
1217
+ "grad_norm": 0.37836119532585144,
1218
  "learning_rate": 1e-06,
1219
+ "loss": 0.4314,
1220
  "step": 173
1221
  },
1222
  {
1223
  "epoch": 0.6925373134328359,
1224
+ "grad_norm": 0.39417001605033875,
1225
  "learning_rate": 1e-06,
1226
+ "loss": 0.4682,
1227
  "step": 174
1228
  },
1229
  {
1230
  "epoch": 0.6965174129353234,
1231
+ "grad_norm": 0.3708493709564209,
1232
  "learning_rate": 1e-06,
1233
+ "loss": 0.4137,
1234
  "step": 175
1235
  },
1236
  {
1237
  "epoch": 0.700497512437811,
1238
+ "grad_norm": 0.37407782673835754,
1239
  "learning_rate": 1e-06,
1240
+ "loss": 0.4383,
1241
  "step": 176
1242
  },
1243
  {
1244
  "epoch": 0.7044776119402985,
1245
+ "grad_norm": 0.3718623220920563,
1246
  "learning_rate": 1e-06,
1247
+ "loss": 0.4675,
1248
  "step": 177
1249
  },
1250
  {
1251
  "epoch": 0.708457711442786,
1252
+ "grad_norm": 0.3738034963607788,
1253
  "learning_rate": 1e-06,
1254
+ "loss": 0.415,
1255
  "step": 178
1256
  },
1257
  {
1258
  "epoch": 0.7124378109452736,
1259
+ "grad_norm": 0.3950786888599396,
1260
  "learning_rate": 1e-06,
1261
+ "loss": 0.4529,
1262
  "step": 179
1263
  },
1264
  {
1265
  "epoch": 0.7164179104477612,
1266
+ "grad_norm": 0.3632413148880005,
1267
  "learning_rate": 1e-06,
1268
+ "loss": 0.4084,
1269
  "step": 180
1270
  },
1271
  {
1272
  "epoch": 0.7203980099502487,
1273
+ "grad_norm": 0.37233033776283264,
1274
  "learning_rate": 1e-06,
1275
+ "loss": 0.4297,
1276
  "step": 181
1277
  },
1278
  {
1279
  "epoch": 0.7243781094527363,
1280
+ "grad_norm": 0.37420183420181274,
1281
  "learning_rate": 1e-06,
1282
+ "loss": 0.4277,
1283
  "step": 182
1284
  },
1285
  {
1286
  "epoch": 0.7283582089552239,
1287
+ "grad_norm": 0.3963206112384796,
1288
  "learning_rate": 1e-06,
1289
+ "loss": 0.4435,
1290
  "step": 183
1291
  },
1292
  {
1293
  "epoch": 0.7323383084577114,
1294
+ "grad_norm": 0.3932362198829651,
1295
  "learning_rate": 1e-06,
1296
+ "loss": 0.4591,
1297
  "step": 184
1298
  },
1299
  {
1300
  "epoch": 0.736318407960199,
1301
+ "grad_norm": 0.3874463737010956,
1302
  "learning_rate": 1e-06,
1303
+ "loss": 0.4989,
1304
  "step": 185
1305
  },
1306
  {
1307
  "epoch": 0.7402985074626866,
1308
+ "grad_norm": 0.38697943091392517,
1309
  "learning_rate": 1e-06,
1310
+ "loss": 0.4557,
1311
  "step": 186
1312
  },
1313
  {
1314
  "epoch": 0.7442786069651741,
1315
+ "grad_norm": 0.3465070128440857,
1316
  "learning_rate": 1e-06,
1317
+ "loss": 0.44,
1318
  "step": 187
1319
  },
1320
  {
1321
  "epoch": 0.7482587064676617,
1322
+ "grad_norm": 0.4213728904724121,
1323
  "learning_rate": 1e-06,
1324
+ "loss": 0.4519,
1325
  "step": 188
1326
  },
1327
  {
1328
  "epoch": 0.7522388059701492,
1329
+ "grad_norm": 0.38543012738227844,
1330
  "learning_rate": 1e-06,
1331
+ "loss": 0.4341,
1332
  "step": 189
1333
  },
1334
  {
1335
  "epoch": 0.7562189054726368,
1336
+ "grad_norm": 0.37096357345581055,
1337
  "learning_rate": 1e-06,
1338
+ "loss": 0.405,
1339
  "step": 190
1340
  },
1341
  {
1342
  "epoch": 0.7601990049751244,
1343
+ "grad_norm": 0.387590616941452,
1344
  "learning_rate": 1e-06,
1345
+ "loss": 0.4121,
1346
  "step": 191
1347
  },
1348
  {
1349
  "epoch": 0.764179104477612,
1350
+ "grad_norm": 0.3805396854877472,
1351
  "learning_rate": 1e-06,
1352
+ "loss": 0.449,
1353
  "step": 192
1354
  },
1355
  {
1356
  "epoch": 0.7681592039800995,
1357
+ "grad_norm": 0.3715638220310211,
1358
  "learning_rate": 1e-06,
1359
+ "loss": 0.4406,
1360
  "step": 193
1361
  },
1362
  {
1363
  "epoch": 0.7721393034825871,
1364
+ "grad_norm": 0.3621780574321747,
1365
  "learning_rate": 1e-06,
1366
+ "loss": 0.4342,
1367
  "step": 194
1368
  },
1369
  {
1370
  "epoch": 0.7761194029850746,
1371
+ "grad_norm": 0.3717781603336334,
1372
  "learning_rate": 1e-06,
1373
+ "loss": 0.402,
1374
  "step": 195
1375
  },
1376
  {
1377
  "epoch": 0.7800995024875622,
1378
+ "grad_norm": 0.39843422174453735,
1379
  "learning_rate": 1e-06,
1380
+ "loss": 0.4413,
1381
  "step": 196
1382
  },
1383
  {
1384
  "epoch": 0.7840796019900498,
1385
+ "grad_norm": 0.39430034160614014,
1386
  "learning_rate": 1e-06,
1387
+ "loss": 0.4441,
1388
  "step": 197
1389
  },
1390
  {
1391
  "epoch": 0.7880597014925373,
1392
+ "grad_norm": 0.3567056953907013,
1393
  "learning_rate": 1e-06,
1394
+ "loss": 0.4016,
1395
  "step": 198
1396
  },
1397
  {
1398
  "epoch": 0.7920398009950249,
1399
+ "grad_norm": 0.3838033676147461,
1400
  "learning_rate": 1e-06,
1401
+ "loss": 0.4321,
1402
  "step": 199
1403
  },
1404
  {
1405
  "epoch": 0.7960199004975125,
1406
+ "grad_norm": 0.3709229826927185,
1407
  "learning_rate": 1e-06,
1408
+ "loss": 0.4101,
1409
  "step": 200
1410
  },
1411
  {
1412
  "epoch": 0.8,
1413
+ "grad_norm": 0.3614198863506317,
1414
  "learning_rate": 1e-06,
1415
+ "loss": 0.3899,
1416
  "step": 201
1417
  },
1418
  {
1419
  "epoch": 0.8039800995024876,
1420
+ "grad_norm": 0.37470588088035583,
1421
  "learning_rate": 1e-06,
1422
+ "loss": 0.4513,
1423
  "step": 202
1424
  },
1425
  {
1426
  "epoch": 0.8079601990049752,
1427
+ "grad_norm": 0.3992566764354706,
1428
  "learning_rate": 1e-06,
1429
+ "loss": 0.4141,
1430
  "step": 203
1431
  },
1432
  {
1433
  "epoch": 0.8119402985074626,
1434
+ "grad_norm": 0.37168097496032715,
1435
  "learning_rate": 1e-06,
1436
+ "loss": 0.4859,
1437
  "step": 204
1438
  },
1439
  {
1440
  "epoch": 0.8159203980099502,
1441
+ "grad_norm": 0.37951424717903137,
1442
  "learning_rate": 1e-06,
1443
+ "loss": 0.4998,
1444
  "step": 205
1445
  },
1446
  {
1447
  "epoch": 0.8199004975124378,
1448
+ "grad_norm": 0.40707215666770935,
1449
  "learning_rate": 1e-06,
1450
+ "loss": 0.4393,
1451
  "step": 206
1452
  },
1453
  {
1454
  "epoch": 0.8238805970149253,
1455
+ "grad_norm": 0.36012160778045654,
1456
  "learning_rate": 1e-06,
1457
+ "loss": 0.4427,
1458
  "step": 207
1459
  },
1460
  {
1461
  "epoch": 0.8278606965174129,
1462
+ "grad_norm": 0.36330950260162354,
1463
  "learning_rate": 1e-06,
1464
+ "loss": 0.4144,
1465
  "step": 208
1466
  },
1467
  {
1468
  "epoch": 0.8318407960199005,
1469
+ "grad_norm": 0.3411954343318939,
1470
  "learning_rate": 1e-06,
1471
+ "loss": 0.4102,
1472
  "step": 209
1473
  },
1474
  {
1475
  "epoch": 0.835820895522388,
1476
+ "grad_norm": 0.38081133365631104,
1477
  "learning_rate": 1e-06,
1478
+ "loss": 0.4357,
1479
  "step": 210
1480
  },
1481
  {
1482
  "epoch": 0.8398009950248756,
1483
+ "grad_norm": 0.34949833154678345,
1484
  "learning_rate": 1e-06,
1485
+ "loss": 0.4092,
1486
  "step": 211
1487
  },
1488
  {
1489
  "epoch": 0.8437810945273632,
1490
+ "grad_norm": 0.37800830602645874,
1491
  "learning_rate": 1e-06,
1492
+ "loss": 0.4707,
1493
  "step": 212
1494
  },
1495
  {
1496
  "epoch": 0.8477611940298507,
1497
+ "grad_norm": 0.3320818245410919,
1498
  "learning_rate": 1e-06,
1499
+ "loss": 0.3851,
1500
  "step": 213
1501
  },
1502
  {
1503
  "epoch": 0.8517412935323383,
1504
+ "grad_norm": 0.37317031621932983,
1505
  "learning_rate": 1e-06,
1506
+ "loss": 0.4383,
1507
  "step": 214
1508
  },
1509
  {
1510
  "epoch": 0.8557213930348259,
1511
+ "grad_norm": 0.359311044216156,
1512
  "learning_rate": 1e-06,
1513
+ "loss": 0.4377,
1514
  "step": 215
1515
  },
1516
  {
1517
  "epoch": 0.8597014925373134,
1518
+ "grad_norm": 0.36889392137527466,
1519
  "learning_rate": 1e-06,
1520
+ "loss": 0.4049,
1521
  "step": 216
1522
  },
1523
  {
1524
  "epoch": 0.863681592039801,
1525
+ "grad_norm": 0.3456409275531769,
1526
  "learning_rate": 1e-06,
1527
+ "loss": 0.4379,
1528
  "step": 217
1529
  },
1530
  {
1531
  "epoch": 0.8676616915422886,
1532
+ "grad_norm": 0.3650165796279907,
1533
  "learning_rate": 1e-06,
1534
+ "loss": 0.418,
1535
  "step": 218
1536
  },
1537
  {
1538
  "epoch": 0.8716417910447761,
1539
+ "grad_norm": 0.3336452543735504,
1540
  "learning_rate": 1e-06,
1541
+ "loss": 0.4024,
1542
  "step": 219
1543
  },
1544
  {
1545
  "epoch": 0.8756218905472637,
1546
+ "grad_norm": 0.3477398157119751,
1547
  "learning_rate": 1e-06,
1548
+ "loss": 0.4429,
1549
  "step": 220
1550
  },
1551
  {
1552
  "epoch": 0.8796019900497513,
1553
+ "grad_norm": 0.32343724370002747,
1554
  "learning_rate": 1e-06,
1555
+ "loss": 0.3906,
1556
  "step": 221
1557
  },
1558
  {
1559
  "epoch": 0.8835820895522388,
1560
+ "grad_norm": 0.3510162830352783,
1561
  "learning_rate": 1e-06,
1562
+ "loss": 0.4281,
1563
  "step": 222
1564
  },
1565
  {
1566
  "epoch": 0.8875621890547264,
1567
+ "grad_norm": 0.34209561347961426,
1568
  "learning_rate": 1e-06,
1569
+ "loss": 0.4144,
1570
  "step": 223
1571
  },
1572
  {
1573
  "epoch": 0.891542288557214,
1574
+ "grad_norm": 0.33722636103630066,
1575
  "learning_rate": 1e-06,
1576
+ "loss": 0.4049,
1577
  "step": 224
1578
  },
1579
  {
1580
  "epoch": 0.8955223880597015,
1581
+ "grad_norm": 0.36703479290008545,
1582
  "learning_rate": 1e-06,
1583
+ "loss": 0.4383,
1584
  "step": 225
1585
  },
1586
  {
1587
  "epoch": 0.8995024875621891,
1588
+ "grad_norm": 0.3338935375213623,
1589
  "learning_rate": 1e-06,
1590
+ "loss": 0.397,
1591
  "step": 226
1592
  },
1593
  {
1594
  "epoch": 0.9034825870646767,
1595
+ "grad_norm": 0.3336848318576813,
1596
  "learning_rate": 1e-06,
1597
+ "loss": 0.4111,
1598
  "step": 227
1599
  },
1600
  {
1601
  "epoch": 0.9074626865671642,
1602
+ "grad_norm": 0.33100321888923645,
1603
  "learning_rate": 1e-06,
1604
+ "loss": 0.4093,
1605
  "step": 228
1606
  },
1607
  {
1608
  "epoch": 0.9114427860696518,
1609
+ "grad_norm": 0.33970826864242554,
1610
  "learning_rate": 1e-06,
1611
+ "loss": 0.39,
1612
  "step": 229
1613
  },
1614
  {
1615
  "epoch": 0.9154228855721394,
1616
+ "grad_norm": 0.3213025629520416,
1617
  "learning_rate": 1e-06,
1618
+ "loss": 0.4266,
1619
  "step": 230
1620
  },
1621
  {
1622
  "epoch": 0.9194029850746268,
1623
+ "grad_norm": 0.31836453080177307,
1624
  "learning_rate": 1e-06,
1625
+ "loss": 0.3955,
1626
  "step": 231
1627
  },
1628
  {
1629
  "epoch": 0.9233830845771144,
1630
+ "grad_norm": 0.33137357234954834,
1631
  "learning_rate": 1e-06,
1632
+ "loss": 0.4623,
1633
  "step": 232
1634
  },
1635
  {
1636
  "epoch": 0.9273631840796019,
1637
+ "grad_norm": 0.3153519034385681,
1638
  "learning_rate": 1e-06,
1639
+ "loss": 0.4005,
1640
  "step": 233
1641
  },
1642
  {
1643
  "epoch": 0.9313432835820895,
1644
+ "grad_norm": 0.3291514217853546,
1645
  "learning_rate": 1e-06,
1646
+ "loss": 0.4156,
1647
  "step": 234
1648
  },
1649
  {
1650
  "epoch": 0.9353233830845771,
1651
+ "grad_norm": 0.31078243255615234,
1652
  "learning_rate": 1e-06,
1653
+ "loss": 0.3792,
1654
  "step": 235
1655
  },
1656
  {
1657
  "epoch": 0.9393034825870646,
1658
+ "grad_norm": 0.32149940729141235,
1659
  "learning_rate": 1e-06,
1660
+ "loss": 0.3913,
1661
  "step": 236
1662
  },
1663
  {
1664
  "epoch": 0.9432835820895522,
1665
+ "grad_norm": 0.3273887634277344,
1666
  "learning_rate": 1e-06,
1667
+ "loss": 0.3881,
1668
  "step": 237
1669
  },
1670
  {
1671
  "epoch": 0.9472636815920398,
1672
+ "grad_norm": 0.33803319931030273,
1673
  "learning_rate": 1e-06,
1674
+ "loss": 0.4183,
1675
  "step": 238
1676
  },
1677
  {
1678
  "epoch": 0.9512437810945273,
1679
+ "grad_norm": 0.30867457389831543,
1680
  "learning_rate": 1e-06,
1681
+ "loss": 0.3875,
1682
  "step": 239
1683
  },
1684
  {
1685
  "epoch": 0.9552238805970149,
1686
+ "grad_norm": 0.30503326654434204,
1687
  "learning_rate": 1e-06,
1688
+ "loss": 0.383,
1689
  "step": 240
1690
  },
1691
  {
1692
  "epoch": 0.9592039800995025,
1693
+ "grad_norm": 0.28712713718414307,
1694
  "learning_rate": 1e-06,
1695
+ "loss": 0.371,
1696
  "step": 241
1697
  },
1698
  {
1699
  "epoch": 0.96318407960199,
1700
+ "grad_norm": 0.2912638783454895,
1701
  "learning_rate": 1e-06,
1702
+ "loss": 0.4035,
1703
  "step": 242
1704
  },
1705
  {
1706
  "epoch": 0.9671641791044776,
1707
+ "grad_norm": 0.2781634032726288,
1708
  "learning_rate": 1e-06,
1709
+ "loss": 0.3848,
1710
  "step": 243
1711
  },
1712
  {
1713
  "epoch": 0.9711442786069652,
1714
+ "grad_norm": 0.3457436263561249,
1715
  "learning_rate": 1e-06,
1716
+ "loss": 0.3857,
1717
  "step": 244
1718
  },
1719
  {
1720
  "epoch": 0.9751243781094527,
1721
+ "grad_norm": 0.3145085871219635,
1722
  "learning_rate": 1e-06,
1723
+ "loss": 0.4167,
1724
  "step": 245
1725
  },
1726
  {
1727
  "epoch": 0.9791044776119403,
1728
+ "grad_norm": 0.3100230097770691,
1729
  "learning_rate": 1e-06,
1730
+ "loss": 0.4051,
1731
  "step": 246
1732
  },
1733
  {
1734
  "epoch": 0.9830845771144279,
1735
+ "grad_norm": 0.29953938722610474,
1736
  "learning_rate": 1e-06,
1737
+ "loss": 0.3985,
1738
  "step": 247
1739
  },
1740
  {
1741
  "epoch": 0.9870646766169154,
1742
+ "grad_norm": 0.27779054641723633,
1743
  "learning_rate": 1e-06,
1744
+ "loss": 0.355,
1745
  "step": 248
1746
  },
1747
  {
1748
  "epoch": 0.991044776119403,
1749
+ "grad_norm": 0.3015543818473816,
1750
  "learning_rate": 1e-06,
1751
+ "loss": 0.3636,
1752
  "step": 249
1753
  },
1754
  {
1755
  "epoch": 0.9950248756218906,
1756
+ "grad_norm": 0.2899494767189026,
1757
  "learning_rate": 1e-06,
1758
+ "loss": 0.3871,
1759
  "step": 250
1760
  },
1761
  {
1762
  "epoch": 0.9990049751243781,
1763
+ "grad_norm": 0.30889588594436646,
1764
  "learning_rate": 1e-06,
1765
+ "loss": 0.3905,
1766
  "step": 251
1767
  },
1768
  {
1769
  "epoch": 0.9990049751243781,
1770
  "step": 251,
1771
+ "total_flos": 1.3756419824156672e+17,
1772
+ "train_loss": 0.5190648520847716,
1773
+ "train_runtime": 16049.3674,
1774
+ "train_samples_per_second": 0.751,
1775
+ "train_steps_per_second": 0.016
1776
  }
1777
  ],
1778
  "logging_steps": 1,
 
1792
  "attributes": {}
1793
  }
1794
  },
1795
+ "total_flos": 1.3756419824156672e+17,
1796
  "train_batch_size": 1,
1797
  "trial_name": null,
1798
  "trial_params": null