hghaan commited on
Commit
f9e562e
·
verified ·
1 Parent(s): d6121fd

update file

Browse files
adapter_config.json CHANGED
@@ -20,12 +20,12 @@
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
- "encoder.block.0.layer.0.SelfAttention.q",
24
  "decoder.block.0.layer.0.SelfAttention.q",
 
25
  "encoder.block.0.layer.0.SelfAttention.v",
26
  "decoder.block.0.layer.0.SelfAttention.k",
27
- "encoder.block.0.layer.0.SelfAttention.k",
28
- "decoder.block.0.layer.0.SelfAttention.v"
29
  ],
30
  "task_type": "SEQ2SEQ_LM",
31
  "use_dora": false,
 
20
  "rank_pattern": {},
21
  "revision": null,
22
  "target_modules": [
23
+ "decoder.block.0.layer.0.SelfAttention.v",
24
  "decoder.block.0.layer.0.SelfAttention.q",
25
+ "encoder.block.0.layer.0.SelfAttention.k",
26
  "encoder.block.0.layer.0.SelfAttention.v",
27
  "decoder.block.0.layer.0.SelfAttention.k",
28
+ "encoder.block.0.layer.0.SelfAttention.q"
 
29
  ],
30
  "task_type": "SEQ2SEQ_LM",
31
  "use_dora": false,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:65888f0de5ec92baef29077564e77fedd9844936f582be94e36eaf7a08fa51a1
3
  size 1181328
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7804b95cebed4f6f8db8baaf5f979f72a108aa34b20499c4341cebfb6f67cfe2
3
  size 1181328
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8d99aa47f1373e797a786636450d11ce1bedf1e455ecc4f1adb13558d091ec34
3
  size 2366982
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f872f390b056e639ab5bbf69899d835411f08590b0b39576bf331a5b98b96c9b
3
  size 2366982
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:888d142298de620b1c05abd3048212502b637c5c30e2c251c2040a31bcf632ab
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00eadd813b6ef57e893ab6475e3a551bc56d211d5c977084ebbb2c8bcff789b1
3
  size 14244
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:313d9e6a2e2433f67a0da276d33bd1e89d9178f1f5f7d57255849b717b7138ef
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d2400a287bc72199e423eecabbc1d481a4ec8fef2fda26a5cc179ecdd2357caf
3
  size 1064
trainer_state.json CHANGED
@@ -3,616 +3,1218 @@
3
  "best_model_checkpoint": null,
4
  "epoch": 4.0,
5
  "eval_steps": 500,
6
- "global_step": 8668,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.046146746654360866,
13
- "grad_norm": 0.21824534237384796,
14
- "learning_rate": 2.23760092272203e-06,
15
- "loss": 3.136,
16
  "step": 100
17
  },
18
  {
19
- "epoch": 0.09229349330872173,
20
- "grad_norm": 0.2850794494152069,
21
- "learning_rate": 4.544405997693195e-06,
22
- "loss": 3.1073,
23
  "step": 200
24
  },
25
  {
26
- "epoch": 0.1384402399630826,
27
- "grad_norm": 0.36149144172668457,
28
- "learning_rate": 6.828143021914648e-06,
29
- "loss": 3.1035,
30
  "step": 300
31
  },
32
  {
33
- "epoch": 0.18458698661744347,
34
- "grad_norm": 0.5060057640075684,
35
- "learning_rate": 9.134948096885815e-06,
36
- "loss": 3.106,
37
  "step": 400
38
  },
39
  {
40
- "epoch": 0.23073373327180433,
41
- "grad_norm": 0.3851727247238159,
42
- "learning_rate": 1.144175317185698e-05,
43
- "loss": 3.0204,
44
  "step": 500
45
  },
46
  {
47
- "epoch": 0.2768804799261652,
48
- "grad_norm": 0.30784064531326294,
49
- "learning_rate": 1.3748558246828143e-05,
50
- "loss": 2.9708,
51
  "step": 600
52
  },
53
  {
54
- "epoch": 0.3230272265805261,
55
- "grad_norm": 0.6206746697425842,
56
- "learning_rate": 1.605536332179931e-05,
57
- "loss": 2.8294,
58
  "step": 700
59
  },
60
  {
61
- "epoch": 0.36917397323488693,
62
- "grad_norm": 0.47670331597328186,
63
- "learning_rate": 1.8362168396770474e-05,
64
- "loss": 2.6824,
65
  "step": 800
66
  },
67
  {
68
- "epoch": 0.4153207198892478,
69
- "grad_norm": 0.4056473672389984,
70
- "learning_rate": 1.9925650557620818e-05,
71
- "loss": 2.4481,
72
  "step": 900
73
  },
74
  {
75
- "epoch": 0.46146746654360865,
76
- "grad_norm": 0.8275523781776428,
77
- "learning_rate": 1.96692731701064e-05,
78
- "loss": 2.0463,
79
  "step": 1000
80
  },
81
  {
82
- "epoch": 0.5076142131979695,
83
- "grad_norm": 0.5123036503791809,
84
- "learning_rate": 1.9412895782591976e-05,
85
- "loss": 1.9237,
86
  "step": 1100
87
  },
88
  {
89
- "epoch": 0.5537609598523304,
90
- "grad_norm": 0.7184427976608276,
91
- "learning_rate": 1.9156518395077554e-05,
92
- "loss": 1.881,
93
  "step": 1200
94
  },
95
  {
96
- "epoch": 0.5999077065066912,
97
- "grad_norm": 1.0534940958023071,
98
- "learning_rate": 1.8900141007563134e-05,
99
- "loss": 1.8219,
100
  "step": 1300
101
  },
102
  {
103
- "epoch": 0.6460544531610521,
104
- "grad_norm": 0.9063017964363098,
105
- "learning_rate": 1.8646327393923858e-05,
106
- "loss": 1.7868,
107
  "step": 1400
108
  },
109
  {
110
- "epoch": 0.6922011998154131,
111
- "grad_norm": 0.7229586839675903,
112
- "learning_rate": 1.8389950006409436e-05,
113
- "loss": 1.7581,
114
  "step": 1500
115
  },
116
  {
117
- "epoch": 0.7383479464697739,
118
- "grad_norm": 0.6040017604827881,
119
- "learning_rate": 1.8133572618895013e-05,
120
- "loss": 1.7096,
121
  "step": 1600
122
  },
123
  {
124
- "epoch": 0.7844946931241348,
125
- "grad_norm": 0.7014355659484863,
126
- "learning_rate": 1.7877195231380594e-05,
127
- "loss": 1.6372,
128
  "step": 1700
129
  },
130
  {
131
- "epoch": 0.8306414397784956,
132
- "grad_norm": 0.45642441511154175,
133
- "learning_rate": 1.762081784386617e-05,
134
- "loss": 1.5638,
135
  "step": 1800
136
  },
137
  {
138
- "epoch": 0.8767881864328565,
139
- "grad_norm": 0.7556698322296143,
140
- "learning_rate": 1.7364440456351752e-05,
141
- "loss": 1.5481,
142
  "step": 1900
143
  },
144
  {
145
- "epoch": 0.9229349330872173,
146
- "grad_norm": 0.48637843132019043,
147
- "learning_rate": 1.710806306883733e-05,
148
- "loss": 1.4976,
149
  "step": 2000
150
  },
151
  {
152
- "epoch": 0.9690816797415782,
153
- "grad_norm": 0.6813339591026306,
154
- "learning_rate": 1.685168568132291e-05,
155
- "loss": 1.4417,
156
  "step": 2100
157
  },
158
  {
159
- "epoch": 1.015228426395939,
160
- "grad_norm": 1.094603180885315,
161
- "learning_rate": 1.6595308293808488e-05,
162
- "loss": 1.3938,
163
  "step": 2200
164
  },
165
  {
166
- "epoch": 1.0613751730503,
167
- "grad_norm": 0.37147483229637146,
168
- "learning_rate": 1.6338930906294065e-05,
169
- "loss": 1.3497,
170
  "step": 2300
171
  },
172
  {
173
- "epoch": 1.1075219197046609,
174
- "grad_norm": 0.2780097424983978,
175
- "learning_rate": 1.6082553518779646e-05,
176
- "loss": 1.3099,
177
  "step": 2400
178
  },
179
  {
180
- "epoch": 1.1536686663590217,
181
- "grad_norm": 0.271342933177948,
182
- "learning_rate": 1.5826176131265223e-05,
183
- "loss": 1.285,
184
  "step": 2500
185
  },
186
  {
187
- "epoch": 1.1998154130133827,
188
- "grad_norm": 0.27299413084983826,
189
- "learning_rate": 1.55697987437508e-05,
190
- "loss": 1.2614,
191
  "step": 2600
192
  },
193
  {
194
- "epoch": 1.2459621596677435,
195
- "grad_norm": 0.29231297969818115,
196
- "learning_rate": 1.531342135623638e-05,
197
- "loss": 1.2308,
198
  "step": 2700
199
  },
200
  {
201
- "epoch": 1.2921089063221043,
202
- "grad_norm": 0.22232797741889954,
203
- "learning_rate": 1.505704396872196e-05,
204
- "loss": 1.1944,
205
  "step": 2800
206
  },
207
  {
208
- "epoch": 1.338255652976465,
209
- "grad_norm": 0.9203324913978577,
210
- "learning_rate": 1.480066658120754e-05,
211
- "loss": 1.1544,
212
  "step": 2900
213
  },
214
  {
215
- "epoch": 1.384402399630826,
216
- "grad_norm": 0.19580288231372833,
217
- "learning_rate": 1.4544289193693117e-05,
218
- "loss": 1.0996,
219
  "step": 3000
220
  },
221
  {
222
- "epoch": 1.430549146285187,
223
- "grad_norm": 0.42558759450912476,
224
- "learning_rate": 1.4287911806178696e-05,
225
- "loss": 1.0759,
226
  "step": 3100
227
  },
228
  {
229
- "epoch": 1.4766958929395477,
230
- "grad_norm": 0.24304209649562836,
231
- "learning_rate": 1.4031534418664275e-05,
232
- "loss": 1.0637,
233
  "step": 3200
234
  },
235
  {
236
- "epoch": 1.5228426395939088,
237
- "grad_norm": 0.25986990332603455,
238
- "learning_rate": 1.3775157031149852e-05,
239
- "loss": 1.0446,
240
  "step": 3300
241
  },
242
  {
243
- "epoch": 1.5689893862482696,
244
- "grad_norm": 0.20725102722644806,
245
- "learning_rate": 1.3518779643635433e-05,
246
- "loss": 1.0384,
247
  "step": 3400
248
  },
249
  {
250
- "epoch": 1.6151361329026304,
251
- "grad_norm": 0.18936870992183685,
252
- "learning_rate": 1.3262402256121012e-05,
253
- "loss": 1.0139,
254
  "step": 3500
255
  },
256
  {
257
- "epoch": 1.6612828795569912,
258
- "grad_norm": 0.19966499507427216,
259
- "learning_rate": 1.300602486860659e-05,
260
- "loss": 1.0058,
261
  "step": 3600
262
  },
263
  {
264
- "epoch": 1.707429626211352,
265
- "grad_norm": 0.30528759956359863,
266
- "learning_rate": 1.2749647481092169e-05,
267
- "loss": 0.9838,
268
  "step": 3700
269
  },
270
  {
271
- "epoch": 1.753576372865713,
272
- "grad_norm": 0.2316664308309555,
273
- "learning_rate": 1.2493270093577748e-05,
274
- "loss": 0.9904,
275
  "step": 3800
276
  },
277
  {
278
- "epoch": 1.7997231195200738,
279
- "grad_norm": 0.2217002511024475,
280
- "learning_rate": 1.2236892706063325e-05,
281
- "loss": 0.9735,
282
  "step": 3900
283
  },
284
  {
285
- "epoch": 1.8458698661744348,
286
- "grad_norm": 0.2654038369655609,
287
- "learning_rate": 1.1980515318548904e-05,
288
- "loss": 0.9804,
289
  "step": 4000
290
  },
291
  {
292
- "epoch": 1.8920166128287956,
293
- "grad_norm": 0.20543397963047028,
294
- "learning_rate": 1.1724137931034483e-05,
295
- "loss": 0.9424,
296
  "step": 4100
297
  },
298
  {
299
- "epoch": 1.9381633594831564,
300
- "grad_norm": 0.24414564669132233,
301
- "learning_rate": 1.1467760543520064e-05,
302
- "loss": 0.9353,
303
  "step": 4200
304
  },
305
  {
306
- "epoch": 1.9843101061375172,
307
- "grad_norm": 0.19333013892173767,
308
- "learning_rate": 1.1211383156005641e-05,
309
- "loss": 0.9374,
310
  "step": 4300
311
  },
312
  {
313
- "epoch": 2.030456852791878,
314
- "grad_norm": 0.20063996315002441,
315
- "learning_rate": 1.095500576849122e-05,
316
- "loss": 0.9409,
317
  "step": 4400
318
  },
319
  {
320
- "epoch": 2.076603599446239,
321
- "grad_norm": 0.4319429397583008,
322
- "learning_rate": 1.0698628380976798e-05,
323
- "loss": 0.93,
324
  "step": 4500
325
  },
326
  {
327
- "epoch": 2.1227503461006,
328
- "grad_norm": 0.21358811855316162,
329
- "learning_rate": 1.0442250993462377e-05,
330
- "loss": 0.9231,
331
  "step": 4600
332
  },
333
  {
334
- "epoch": 2.168897092754961,
335
- "grad_norm": 0.2252470701932907,
336
- "learning_rate": 1.0185873605947956e-05,
337
- "loss": 0.9267,
338
  "step": 4700
339
  },
340
  {
341
- "epoch": 2.2150438394093217,
342
- "grad_norm": 0.6058911681175232,
343
- "learning_rate": 9.929496218433535e-06,
344
- "loss": 0.902,
345
  "step": 4800
346
  },
347
  {
348
- "epoch": 2.2611905860636825,
349
- "grad_norm": 0.27027812600135803,
350
- "learning_rate": 9.673118830919114e-06,
351
- "loss": 0.8908,
352
  "step": 4900
353
  },
354
  {
355
- "epoch": 2.3073373327180433,
356
- "grad_norm": 0.3116415739059448,
357
- "learning_rate": 9.416741443404692e-06,
358
- "loss": 0.8971,
359
  "step": 5000
360
  },
361
  {
362
- "epoch": 2.353484079372404,
363
- "grad_norm": 0.2324889898300171,
364
- "learning_rate": 9.160364055890272e-06,
365
- "loss": 0.8927,
366
  "step": 5100
367
  },
368
  {
369
- "epoch": 2.3996308260267654,
370
- "grad_norm": 0.18322697281837463,
371
- "learning_rate": 8.90398666837585e-06,
372
- "loss": 0.8844,
373
  "step": 5200
374
  },
375
  {
376
- "epoch": 2.445777572681126,
377
- "grad_norm": 1.4241108894348145,
378
- "learning_rate": 8.650173054736572e-06,
379
- "loss": 0.883,
380
  "step": 5300
381
  },
382
  {
383
- "epoch": 2.491924319335487,
384
- "grad_norm": 0.22246557474136353,
385
- "learning_rate": 8.393795667222153e-06,
386
- "loss": 0.8841,
387
  "step": 5400
388
  },
389
  {
390
- "epoch": 2.5380710659898478,
391
- "grad_norm": 0.19322210550308228,
392
- "learning_rate": 8.13741827970773e-06,
393
- "loss": 0.89,
394
  "step": 5500
395
  },
396
  {
397
- "epoch": 2.5842178126442086,
398
- "grad_norm": 0.20664915442466736,
399
- "learning_rate": 7.881040892193309e-06,
400
- "loss": 0.8762,
401
  "step": 5600
402
  },
403
  {
404
- "epoch": 2.6303645592985694,
405
- "grad_norm": 0.19776581227779388,
406
- "learning_rate": 7.624663504678887e-06,
407
- "loss": 0.8649,
408
  "step": 5700
409
  },
410
  {
411
- "epoch": 2.67651130595293,
412
- "grad_norm": 0.7888526916503906,
413
- "learning_rate": 7.368286117164467e-06,
414
- "loss": 0.8678,
415
  "step": 5800
416
  },
417
  {
418
- "epoch": 2.722658052607291,
419
- "grad_norm": 0.7994652390480042,
420
- "learning_rate": 7.1119087296500455e-06,
421
- "loss": 0.7896,
422
  "step": 5900
423
  },
424
  {
425
- "epoch": 2.768804799261652,
426
- "grad_norm": 0.2278624325990677,
427
- "learning_rate": 6.855531342135624e-06,
428
- "loss": 0.7324,
429
  "step": 6000
430
  },
431
  {
432
- "epoch": 2.814951545916013,
433
- "grad_norm": 0.24736915528774261,
434
- "learning_rate": 6.599153954621203e-06,
435
- "loss": 0.7139,
436
  "step": 6100
437
  },
438
  {
439
- "epoch": 2.861098292570374,
440
- "grad_norm": 0.22141049802303314,
441
- "learning_rate": 6.342776567106782e-06,
442
- "loss": 0.7116,
443
  "step": 6200
444
  },
445
  {
446
- "epoch": 2.9072450392247347,
447
- "grad_norm": 0.21581608057022095,
448
- "learning_rate": 6.08639917959236e-06,
449
- "loss": 0.7185,
450
  "step": 6300
451
  },
452
  {
453
- "epoch": 2.9533917858790955,
454
- "grad_norm": 0.27274981141090393,
455
- "learning_rate": 5.830021792077939e-06,
456
- "loss": 0.7114,
457
  "step": 6400
458
  },
459
  {
460
- "epoch": 2.9995385325334563,
461
- "grad_norm": 0.4279099106788635,
462
- "learning_rate": 5.5736444045635175e-06,
463
- "loss": 0.6974,
464
  "step": 6500
465
  },
466
  {
467
- "epoch": 3.045685279187817,
468
- "grad_norm": 0.4010777473449707,
469
- "learning_rate": 5.3172670170490966e-06,
470
- "loss": 0.7089,
471
  "step": 6600
472
  },
473
  {
474
- "epoch": 3.0918320258421783,
475
- "grad_norm": 0.4470697343349457,
476
- "learning_rate": 5.060889629534676e-06,
477
- "loss": 0.6895,
478
  "step": 6700
479
  },
480
  {
481
- "epoch": 3.137978772496539,
482
- "grad_norm": 0.534737229347229,
483
- "learning_rate": 4.804512242020255e-06,
484
- "loss": 0.709,
485
  "step": 6800
486
  },
487
  {
488
- "epoch": 3.1841255191509,
489
- "grad_norm": 0.3858148157596588,
490
- "learning_rate": 4.548134854505833e-06,
491
- "loss": 0.7008,
492
  "step": 6900
493
  },
494
  {
495
- "epoch": 3.2302722658052607,
496
- "grad_norm": 4.884620189666748,
497
- "learning_rate": 4.291757466991412e-06,
498
- "loss": 0.6926,
499
  "step": 7000
500
  },
501
  {
502
- "epoch": 3.2764190124596215,
503
- "grad_norm": 0.44127726554870605,
504
- "learning_rate": 4.03538007947699e-06,
505
- "loss": 0.6986,
506
  "step": 7100
507
  },
508
  {
509
- "epoch": 3.3225657591139823,
510
- "grad_norm": 1.2448310852050781,
511
- "learning_rate": 3.7790026919625694e-06,
512
- "loss": 0.6884,
513
  "step": 7200
514
  },
515
  {
516
- "epoch": 3.368712505768343,
517
- "grad_norm": 0.8101204633712769,
518
- "learning_rate": 3.522625304448148e-06,
519
- "loss": 0.6851,
520
  "step": 7300
521
  },
522
  {
523
- "epoch": 3.4148592524227044,
524
- "grad_norm": 0.5153388381004333,
525
- "learning_rate": 3.2662479169337267e-06,
526
- "loss": 0.6938,
527
  "step": 7400
528
  },
529
  {
530
- "epoch": 3.461005999077065,
531
- "grad_norm": 0.33079493045806885,
532
- "learning_rate": 3.0098705294193053e-06,
533
- "loss": 0.6948,
534
  "step": 7500
535
  },
536
  {
537
- "epoch": 3.507152745731426,
538
- "grad_norm": 1.0203328132629395,
539
- "learning_rate": 2.7534931419048844e-06,
540
- "loss": 0.6855,
541
  "step": 7600
542
  },
543
  {
544
- "epoch": 3.553299492385787,
545
- "grad_norm": 0.3520820140838623,
546
- "learning_rate": 2.4971157543904627e-06,
547
- "loss": 0.6849,
548
  "step": 7700
549
  },
550
  {
551
- "epoch": 3.5994462390401476,
552
- "grad_norm": 0.28180956840515137,
553
- "learning_rate": 2.2407383668760417e-06,
554
- "loss": 0.699,
555
  "step": 7800
556
  },
557
  {
558
- "epoch": 3.6455929856945084,
559
- "grad_norm": 0.33973556756973267,
560
- "learning_rate": 1.9843609793616204e-06,
561
- "loss": 0.6756,
562
  "step": 7900
563
  },
564
  {
565
- "epoch": 3.6917397323488697,
566
- "grad_norm": 0.3416615128517151,
567
- "learning_rate": 1.7279835918471993e-06,
568
- "loss": 0.685,
569
  "step": 8000
570
  },
571
  {
572
- "epoch": 3.7378864790032305,
573
- "grad_norm": 0.7213825583457947,
574
- "learning_rate": 1.4716062043327781e-06,
575
- "loss": 0.6764,
576
  "step": 8100
577
  },
578
  {
579
- "epoch": 3.7840332256575913,
580
- "grad_norm": 0.5637441873550415,
581
- "learning_rate": 1.2152288168183566e-06,
582
- "loss": 0.6896,
583
  "step": 8200
584
  },
585
  {
586
- "epoch": 3.830179972311952,
587
- "grad_norm": 0.4190536141395569,
588
- "learning_rate": 9.588514293039355e-07,
589
- "loss": 0.675,
590
  "step": 8300
591
  },
592
  {
593
- "epoch": 3.876326718966313,
594
- "grad_norm": 0.37957823276519775,
595
- "learning_rate": 7.024740417895142e-07,
596
- "loss": 0.6767,
597
  "step": 8400
598
  },
599
  {
600
- "epoch": 3.9224734656206737,
601
- "grad_norm": 0.5404504537582397,
602
- "learning_rate": 4.4609665427509294e-07,
603
- "loss": 0.695,
604
  "step": 8500
605
  },
606
  {
607
- "epoch": 3.9686202122750345,
608
- "grad_norm": 14.911314964294434,
609
- "learning_rate": 1.8971926676067174e-07,
610
- "loss": 0.6794,
611
  "step": 8600
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
612
  }
613
  ],
614
  "logging_steps": 100,
615
- "max_steps": 8668,
616
  "num_input_tokens_seen": 0,
617
  "num_train_epochs": 4,
618
  "save_steps": 100,
@@ -628,7 +1230,7 @@
628
  "attributes": {}
629
  }
630
  },
631
- "total_flos": 1.884583671986995e+16,
632
  "train_batch_size": 4,
633
  "trial_name": null,
634
  "trial_params": null
 
3
  "best_model_checkpoint": null,
4
  "epoch": 4.0,
5
  "eval_steps": 500,
6
+ "global_step": 17216,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.023234200743494422,
13
+ "grad_norm": 0.33875614404678345,
14
+ "learning_rate": 1.1149825783972125e-06,
15
+ "loss": 3.2044,
16
  "step": 100
17
  },
18
  {
19
+ "epoch": 0.046468401486988845,
20
+ "grad_norm": 0.09059225022792816,
21
+ "learning_rate": 2.2764227642276426e-06,
22
+ "loss": 3.1582,
23
  "step": 200
24
  },
25
  {
26
+ "epoch": 0.06970260223048327,
27
+ "grad_norm": 0.24917162954807281,
28
+ "learning_rate": 3.4378629500580724e-06,
29
+ "loss": 3.1608,
30
  "step": 300
31
  },
32
  {
33
+ "epoch": 0.09293680297397769,
34
+ "grad_norm": 0.465605229139328,
35
+ "learning_rate": 4.599303135888502e-06,
36
+ "loss": 3.1981,
37
  "step": 400
38
  },
39
  {
40
+ "epoch": 0.11617100371747212,
41
+ "grad_norm": 0.27495619654655457,
42
+ "learning_rate": 5.7607433217189324e-06,
43
+ "loss": 3.1815,
44
  "step": 500
45
  },
46
  {
47
+ "epoch": 0.13940520446096655,
48
+ "grad_norm": 0.19188807904720306,
49
+ "learning_rate": 6.922183507549362e-06,
50
+ "loss": 3.1294,
51
  "step": 600
52
  },
53
  {
54
+ "epoch": 0.16263940520446096,
55
+ "grad_norm": 0.5246957540512085,
56
+ "learning_rate": 8.083623693379791e-06,
57
+ "loss": 3.0677,
58
  "step": 700
59
  },
60
  {
61
+ "epoch": 0.18587360594795538,
62
+ "grad_norm": 0.258408784866333,
63
+ "learning_rate": 9.24506387921022e-06,
64
+ "loss": 2.9832,
65
  "step": 800
66
  },
67
  {
68
+ "epoch": 0.20910780669144982,
69
+ "grad_norm": 0.31014084815979004,
70
+ "learning_rate": 1.0406504065040652e-05,
71
+ "loss": 2.9743,
72
  "step": 900
73
  },
74
  {
75
+ "epoch": 0.23234200743494424,
76
+ "grad_norm": 0.4873325824737549,
77
+ "learning_rate": 1.1567944250871081e-05,
78
+ "loss": 2.8721,
79
  "step": 1000
80
  },
81
  {
82
+ "epoch": 0.2555762081784387,
83
+ "grad_norm": 0.7442412972450256,
84
+ "learning_rate": 1.272938443670151e-05,
85
+ "loss": 2.7949,
86
  "step": 1100
87
  },
88
  {
89
+ "epoch": 0.2788104089219331,
90
+ "grad_norm": 0.6129536628723145,
91
+ "learning_rate": 1.389082462253194e-05,
92
+ "loss": 2.6381,
93
  "step": 1200
94
  },
95
  {
96
+ "epoch": 0.3020446096654275,
97
+ "grad_norm": 0.5687291026115417,
98
+ "learning_rate": 1.5052264808362371e-05,
99
+ "loss": 2.4031,
100
  "step": 1300
101
  },
102
  {
103
+ "epoch": 0.3252788104089219,
104
+ "grad_norm": 0.6154528856277466,
105
+ "learning_rate": 1.62137049941928e-05,
106
+ "loss": 2.107,
107
  "step": 1400
108
  },
109
  {
110
+ "epoch": 0.34851301115241634,
111
+ "grad_norm": 0.8730382323265076,
112
+ "learning_rate": 1.7375145180023228e-05,
113
+ "loss": 1.981,
114
  "step": 1500
115
  },
116
  {
117
+ "epoch": 0.37174721189591076,
118
+ "grad_norm": 0.6668545603752136,
119
+ "learning_rate": 1.8536585365853663e-05,
120
+ "loss": 1.9311,
121
  "step": 1600
122
  },
123
  {
124
+ "epoch": 0.3949814126394052,
125
+ "grad_norm": 0.6021186709403992,
126
+ "learning_rate": 1.969802555168409e-05,
127
+ "loss": 1.8733,
128
  "step": 1700
129
  },
130
  {
131
+ "epoch": 0.41821561338289964,
132
+ "grad_norm": 0.8815124034881592,
133
+ "learning_rate": 1.9904479153220604e-05,
134
+ "loss": 1.8434,
135
  "step": 1800
136
  },
137
  {
138
+ "epoch": 0.44144981412639406,
139
+ "grad_norm": 1.1727079153060913,
140
+ "learning_rate": 1.9775396927843037e-05,
141
+ "loss": 1.8051,
142
  "step": 1900
143
  },
144
  {
145
+ "epoch": 0.4646840148698885,
146
+ "grad_norm": 1.1215996742248535,
147
+ "learning_rate": 1.964631470246547e-05,
148
+ "loss": 1.733,
149
  "step": 2000
150
  },
151
  {
152
+ "epoch": 0.4879182156133829,
153
+ "grad_norm": 1.1965365409851074,
154
+ "learning_rate": 1.9517232477087907e-05,
155
+ "loss": 1.6994,
156
  "step": 2100
157
  },
158
  {
159
+ "epoch": 0.5111524163568774,
160
+ "grad_norm": 1.2489936351776123,
161
+ "learning_rate": 1.938815025171034e-05,
162
+ "loss": 1.6529,
163
  "step": 2200
164
  },
165
  {
166
+ "epoch": 0.5343866171003717,
167
+ "grad_norm": 1.5988222360610962,
168
+ "learning_rate": 1.9259068026332776e-05,
169
+ "loss": 1.5897,
170
  "step": 2300
171
  },
172
  {
173
+ "epoch": 0.5576208178438662,
174
+ "grad_norm": 0.6558517217636108,
175
+ "learning_rate": 1.912998580095521e-05,
176
+ "loss": 1.5099,
177
  "step": 2400
178
  },
179
  {
180
+ "epoch": 0.5808550185873605,
181
+ "grad_norm": 0.7629631757736206,
182
+ "learning_rate": 1.900219439783142e-05,
183
+ "loss": 1.4466,
184
  "step": 2500
185
  },
186
  {
187
+ "epoch": 0.604089219330855,
188
+ "grad_norm": 0.9707331657409668,
189
+ "learning_rate": 1.8873112172453855e-05,
190
+ "loss": 1.3949,
191
  "step": 2600
192
  },
193
  {
194
+ "epoch": 0.6273234200743495,
195
+ "grad_norm": 0.849176287651062,
196
+ "learning_rate": 1.874402994707629e-05,
197
+ "loss": 1.3449,
198
  "step": 2700
199
  },
200
  {
201
+ "epoch": 0.6505576208178439,
202
+ "grad_norm": 0.460151731967926,
203
+ "learning_rate": 1.8614947721698724e-05,
204
+ "loss": 1.3182,
205
  "step": 2800
206
  },
207
  {
208
+ "epoch": 0.6737918215613383,
209
+ "grad_norm": 0.652923047542572,
210
+ "learning_rate": 1.8485865496321157e-05,
211
+ "loss": 1.2623,
212
  "step": 2900
213
  },
214
  {
215
+ "epoch": 0.6970260223048327,
216
+ "grad_norm": 0.5269683599472046,
217
+ "learning_rate": 1.8356783270943594e-05,
218
+ "loss": 1.2059,
219
  "step": 3000
220
  },
221
  {
222
+ "epoch": 0.7202602230483272,
223
+ "grad_norm": 0.6761623024940491,
224
+ "learning_rate": 1.8227701045566027e-05,
225
+ "loss": 1.1477,
226
  "step": 3100
227
  },
228
  {
229
+ "epoch": 0.7434944237918215,
230
+ "grad_norm": 0.4611155390739441,
231
+ "learning_rate": 1.809861882018846e-05,
232
+ "loss": 1.1063,
233
  "step": 3200
234
  },
235
  {
236
+ "epoch": 0.766728624535316,
237
+ "grad_norm": 1.20090913772583,
238
+ "learning_rate": 1.7969536594810896e-05,
239
+ "loss": 1.0812,
240
  "step": 3300
241
  },
242
  {
243
+ "epoch": 0.7899628252788105,
244
+ "grad_norm": 0.5198754072189331,
245
+ "learning_rate": 1.7840454369433332e-05,
246
+ "loss": 1.0637,
247
  "step": 3400
248
  },
249
  {
250
+ "epoch": 0.8131970260223048,
251
+ "grad_norm": 0.7287588119506836,
252
+ "learning_rate": 1.7711372144055765e-05,
253
+ "loss": 1.0311,
254
  "step": 3500
255
  },
256
  {
257
+ "epoch": 0.8364312267657993,
258
+ "grad_norm": 0.850121021270752,
259
+ "learning_rate": 1.75822899186782e-05,
260
+ "loss": 0.9687,
261
  "step": 3600
262
  },
263
  {
264
+ "epoch": 0.8596654275092936,
265
+ "grad_norm": 0.5256717801094055,
266
+ "learning_rate": 1.7453207693300635e-05,
267
+ "loss": 0.8706,
268
  "step": 3700
269
  },
270
  {
271
+ "epoch": 0.8828996282527881,
272
+ "grad_norm": 0.6515185236930847,
273
+ "learning_rate": 1.7324125467923068e-05,
274
+ "loss": 0.8474,
275
  "step": 3800
276
  },
277
  {
278
+ "epoch": 0.9061338289962825,
279
+ "grad_norm": 0.8604176640510559,
280
+ "learning_rate": 1.7195043242545504e-05,
281
+ "loss": 0.8302,
282
  "step": 3900
283
  },
284
  {
285
+ "epoch": 0.929368029739777,
286
+ "grad_norm": 0.3369189202785492,
287
+ "learning_rate": 1.7065961017167937e-05,
288
+ "loss": 0.7959,
289
  "step": 4000
290
  },
291
  {
292
+ "epoch": 0.9526022304832714,
293
+ "grad_norm": 0.4804532527923584,
294
+ "learning_rate": 1.6936878791790373e-05,
295
+ "loss": 0.7945,
296
  "step": 4100
297
  },
298
  {
299
+ "epoch": 0.9758364312267658,
300
+ "grad_norm": 0.3839660882949829,
301
+ "learning_rate": 1.6807796566412806e-05,
302
+ "loss": 0.7975,
303
  "step": 4200
304
  },
305
  {
306
+ "epoch": 0.9990706319702602,
307
+ "grad_norm": 0.31136325001716614,
308
+ "learning_rate": 1.667871434103524e-05,
309
+ "loss": 0.7804,
310
  "step": 4300
311
  },
312
  {
313
+ "epoch": 1.0223048327137547,
314
+ "grad_norm": 0.2822754681110382,
315
+ "learning_rate": 1.6549632115657676e-05,
316
+ "loss": 0.7502,
317
  "step": 4400
318
  },
319
  {
320
+ "epoch": 1.045539033457249,
321
+ "grad_norm": 0.3364527225494385,
322
+ "learning_rate": 1.6420549890280112e-05,
323
+ "loss": 0.747,
324
  "step": 4500
325
  },
326
  {
327
+ "epoch": 1.0687732342007434,
328
+ "grad_norm": 0.45242545008659363,
329
+ "learning_rate": 1.6291467664902545e-05,
330
+ "loss": 0.7263,
331
  "step": 4600
332
  },
333
  {
334
+ "epoch": 1.092007434944238,
335
+ "grad_norm": 0.2541595995426178,
336
+ "learning_rate": 1.6162385439524978e-05,
337
+ "loss": 0.7311,
338
  "step": 4700
339
  },
340
  {
341
+ "epoch": 1.1152416356877324,
342
+ "grad_norm": 0.32410866022109985,
343
+ "learning_rate": 1.6033303214147415e-05,
344
+ "loss": 0.7213,
345
  "step": 4800
346
  },
347
  {
348
+ "epoch": 1.1384758364312269,
349
+ "grad_norm": 0.28702208399772644,
350
+ "learning_rate": 1.5904220988769848e-05,
351
+ "loss": 0.7103,
352
  "step": 4900
353
  },
354
  {
355
+ "epoch": 1.161710037174721,
356
+ "grad_norm": 0.2637524902820587,
357
+ "learning_rate": 1.577513876339228e-05,
358
+ "loss": 0.7033,
359
  "step": 5000
360
  },
361
  {
362
+ "epoch": 1.1849442379182156,
363
+ "grad_norm": 0.38048645853996277,
364
+ "learning_rate": 1.5646056538014717e-05,
365
+ "loss": 0.7111,
366
  "step": 5100
367
  },
368
  {
369
+ "epoch": 1.20817843866171,
370
+ "grad_norm": 0.22926197946071625,
371
+ "learning_rate": 1.5516974312637153e-05,
372
+ "loss": 0.7053,
373
  "step": 5200
374
  },
375
  {
376
+ "epoch": 1.2314126394052045,
377
+ "grad_norm": 0.2666023373603821,
378
+ "learning_rate": 1.5387892087259586e-05,
379
+ "loss": 0.6915,
380
  "step": 5300
381
  },
382
  {
383
+ "epoch": 1.2546468401486988,
384
+ "grad_norm": 0.2618410587310791,
385
+ "learning_rate": 1.525880986188202e-05,
386
+ "loss": 0.6843,
387
  "step": 5400
388
  },
389
  {
390
+ "epoch": 1.2778810408921932,
391
+ "grad_norm": 0.24479706585407257,
392
+ "learning_rate": 1.5129727636504454e-05,
393
+ "loss": 0.6775,
394
  "step": 5500
395
  },
396
  {
397
+ "epoch": 1.3011152416356877,
398
+ "grad_norm": 0.19555561244487762,
399
+ "learning_rate": 1.5000645411126889e-05,
400
+ "loss": 0.6601,
401
  "step": 5600
402
  },
403
  {
404
+ "epoch": 1.3243494423791822,
405
+ "grad_norm": 0.2121550738811493,
406
+ "learning_rate": 1.4871563185749323e-05,
407
+ "loss": 0.6625,
408
  "step": 5700
409
  },
410
  {
411
+ "epoch": 1.3475836431226766,
412
+ "grad_norm": 0.36492133140563965,
413
+ "learning_rate": 1.474248096037176e-05,
414
+ "loss": 0.6567,
415
  "step": 5800
416
  },
417
  {
418
+ "epoch": 1.370817843866171,
419
+ "grad_norm": 0.28411343693733215,
420
+ "learning_rate": 1.4613398734994193e-05,
421
+ "loss": 0.6424,
422
  "step": 5900
423
  },
424
  {
425
+ "epoch": 1.3940520446096654,
426
+ "grad_norm": 0.3487832248210907,
427
+ "learning_rate": 1.4484316509616627e-05,
428
+ "loss": 0.6508,
429
  "step": 6000
430
  },
431
  {
432
+ "epoch": 1.4172862453531598,
433
+ "grad_norm": 0.4025629758834839,
434
+ "learning_rate": 1.4355234284239062e-05,
435
+ "loss": 0.6374,
436
  "step": 6100
437
  },
438
  {
439
+ "epoch": 1.4405204460966543,
440
+ "grad_norm": 0.31936919689178467,
441
+ "learning_rate": 1.4226152058861495e-05,
442
+ "loss": 0.6462,
443
  "step": 6200
444
  },
445
  {
446
+ "epoch": 1.4637546468401488,
447
+ "grad_norm": 0.27360206842422485,
448
+ "learning_rate": 1.409706983348393e-05,
449
+ "loss": 0.6382,
450
  "step": 6300
451
  },
452
  {
453
+ "epoch": 1.486988847583643,
454
+ "grad_norm": 0.35483697056770325,
455
+ "learning_rate": 1.3967987608106366e-05,
456
+ "loss": 0.6274,
457
  "step": 6400
458
  },
459
  {
460
+ "epoch": 1.5102230483271375,
461
+ "grad_norm": 0.30311813950538635,
462
+ "learning_rate": 1.38389053827288e-05,
463
+ "loss": 0.6258,
464
  "step": 6500
465
  },
466
  {
467
+ "epoch": 1.533457249070632,
468
+ "grad_norm": 0.3184954524040222,
469
+ "learning_rate": 1.3709823157351234e-05,
470
+ "loss": 0.6313,
471
  "step": 6600
472
  },
473
  {
474
+ "epoch": 1.5566914498141264,
475
+ "grad_norm": 0.2632908821105957,
476
+ "learning_rate": 1.3580740931973668e-05,
477
+ "loss": 0.6217,
478
  "step": 6700
479
  },
480
  {
481
+ "epoch": 1.579925650557621,
482
+ "grad_norm": 0.22145096957683563,
483
+ "learning_rate": 1.3451658706596103e-05,
484
+ "loss": 0.6245,
485
  "step": 6800
486
  },
487
  {
488
+ "epoch": 1.6031598513011152,
489
+ "grad_norm": 0.5008528828620911,
490
+ "learning_rate": 1.3322576481218536e-05,
491
+ "loss": 0.6187,
492
  "step": 6900
493
  },
494
  {
495
+ "epoch": 1.6263940520446096,
496
+ "grad_norm": 0.25452372431755066,
497
+ "learning_rate": 1.3193494255840972e-05,
498
+ "loss": 0.6084,
499
  "step": 7000
500
  },
501
  {
502
+ "epoch": 1.649628252788104,
503
+ "grad_norm": 0.3917735815048218,
504
+ "learning_rate": 1.3064412030463407e-05,
505
+ "loss": 0.6088,
506
  "step": 7100
507
  },
508
  {
509
+ "epoch": 1.6728624535315983,
510
+ "grad_norm": 0.28736940026283264,
511
+ "learning_rate": 1.2935329805085842e-05,
512
+ "loss": 0.6084,
513
  "step": 7200
514
  },
515
  {
516
+ "epoch": 1.696096654275093,
517
+ "grad_norm": 0.3900860548019409,
518
+ "learning_rate": 1.2807538401962051e-05,
519
+ "loss": 0.6017,
520
  "step": 7300
521
  },
522
  {
523
+ "epoch": 1.7193308550185873,
524
+ "grad_norm": 0.2482582926750183,
525
+ "learning_rate": 1.2678456176584486e-05,
526
+ "loss": 0.5964,
527
  "step": 7400
528
  },
529
  {
530
+ "epoch": 1.7425650557620818,
531
+ "grad_norm": 0.2464774250984192,
532
+ "learning_rate": 1.254937395120692e-05,
533
+ "loss": 0.5929,
534
  "step": 7500
535
  },
536
  {
537
+ "epoch": 1.7657992565055762,
538
+ "grad_norm": 0.36112162470817566,
539
+ "learning_rate": 1.2420291725829354e-05,
540
+ "loss": 0.5913,
541
  "step": 7600
542
  },
543
  {
544
+ "epoch": 1.7890334572490705,
545
+ "grad_norm": 0.30204829573631287,
546
+ "learning_rate": 1.2291209500451788e-05,
547
+ "loss": 0.5804,
548
  "step": 7700
549
  },
550
  {
551
+ "epoch": 1.8122676579925652,
552
+ "grad_norm": 0.2731075584888458,
553
+ "learning_rate": 1.2162127275074223e-05,
554
+ "loss": 0.5881,
555
  "step": 7800
556
  },
557
  {
558
+ "epoch": 1.8355018587360594,
559
+ "grad_norm": 0.24604862928390503,
560
+ "learning_rate": 1.2033045049696656e-05,
561
+ "loss": 0.5679,
562
  "step": 7900
563
  },
564
  {
565
+ "epoch": 1.858736059479554,
566
+ "grad_norm": 0.3449194133281708,
567
+ "learning_rate": 1.1903962824319092e-05,
568
+ "loss": 0.582,
569
  "step": 8000
570
  },
571
  {
572
+ "epoch": 1.8819702602230484,
573
+ "grad_norm": 0.310375452041626,
574
+ "learning_rate": 1.1774880598941527e-05,
575
+ "loss": 0.575,
576
  "step": 8100
577
  },
578
  {
579
+ "epoch": 1.9052044609665426,
580
+ "grad_norm": 0.28315114974975586,
581
+ "learning_rate": 1.1645798373563962e-05,
582
+ "loss": 0.5722,
583
  "step": 8200
584
  },
585
  {
586
+ "epoch": 1.9284386617100373,
587
+ "grad_norm": 0.3091906011104584,
588
+ "learning_rate": 1.1516716148186395e-05,
589
+ "loss": 0.5533,
590
  "step": 8300
591
  },
592
  {
593
+ "epoch": 1.9516728624535316,
594
+ "grad_norm": 0.28990840911865234,
595
+ "learning_rate": 1.138763392280883e-05,
596
+ "loss": 0.5724,
597
  "step": 8400
598
  },
599
  {
600
+ "epoch": 1.974907063197026,
601
+ "grad_norm": 0.44591304659843445,
602
+ "learning_rate": 1.1258551697431264e-05,
603
+ "loss": 0.5701,
604
  "step": 8500
605
  },
606
  {
607
+ "epoch": 1.9981412639405205,
608
+ "grad_norm": 0.26404786109924316,
609
+ "learning_rate": 1.11294694720537e-05,
610
+ "loss": 0.553,
611
  "step": 8600
612
+ },
613
+ {
614
+ "epoch": 2.0213754646840147,
615
+ "grad_norm": 0.2843058705329895,
616
+ "learning_rate": 1.1000387246676133e-05,
617
+ "loss": 0.5631,
618
+ "step": 8700
619
+ },
620
+ {
621
+ "epoch": 2.0446096654275094,
622
+ "grad_norm": 0.20029422640800476,
623
+ "learning_rate": 1.0871305021298568e-05,
624
+ "loss": 0.5495,
625
+ "step": 8800
626
+ },
627
+ {
628
+ "epoch": 2.0678438661710037,
629
+ "grad_norm": 0.26215997338294983,
630
+ "learning_rate": 1.0742222795921003e-05,
631
+ "loss": 0.5562,
632
+ "step": 8900
633
+ },
634
+ {
635
+ "epoch": 2.091078066914498,
636
+ "grad_norm": 0.29611942172050476,
637
+ "learning_rate": 1.0613140570543436e-05,
638
+ "loss": 0.5541,
639
+ "step": 9000
640
+ },
641
+ {
642
+ "epoch": 2.1143122676579926,
643
+ "grad_norm": 0.2809213697910309,
644
+ "learning_rate": 1.048405834516587e-05,
645
+ "loss": 0.5429,
646
+ "step": 9100
647
+ },
648
+ {
649
+ "epoch": 2.137546468401487,
650
+ "grad_norm": 0.4684973657131195,
651
+ "learning_rate": 1.0354976119788307e-05,
652
+ "loss": 0.5518,
653
+ "step": 9200
654
+ },
655
+ {
656
+ "epoch": 2.1607806691449816,
657
+ "grad_norm": 0.2790776193141937,
658
+ "learning_rate": 1.0225893894410741e-05,
659
+ "loss": 0.5485,
660
+ "step": 9300
661
+ },
662
+ {
663
+ "epoch": 2.184014869888476,
664
+ "grad_norm": 0.24624982476234436,
665
+ "learning_rate": 1.0096811669033174e-05,
666
+ "loss": 0.5434,
667
+ "step": 9400
668
+ },
669
+ {
670
+ "epoch": 2.20724907063197,
671
+ "grad_norm": 0.27161070704460144,
672
+ "learning_rate": 9.967729443655609e-06,
673
+ "loss": 0.5503,
674
+ "step": 9500
675
+ },
676
+ {
677
+ "epoch": 2.2304832713754648,
678
+ "grad_norm": 0.2635902166366577,
679
+ "learning_rate": 9.838647218278044e-06,
680
+ "loss": 0.538,
681
+ "step": 9600
682
+ },
683
+ {
684
+ "epoch": 2.253717472118959,
685
+ "grad_norm": 0.35729700326919556,
686
+ "learning_rate": 9.709564992900478e-06,
687
+ "loss": 0.5376,
688
+ "step": 9700
689
+ },
690
+ {
691
+ "epoch": 2.2769516728624537,
692
+ "grad_norm": 0.224281907081604,
693
+ "learning_rate": 9.580482767522913e-06,
694
+ "loss": 0.5423,
695
+ "step": 9800
696
+ },
697
+ {
698
+ "epoch": 2.300185873605948,
699
+ "grad_norm": 0.2016523778438568,
700
+ "learning_rate": 9.451400542145348e-06,
701
+ "loss": 0.54,
702
+ "step": 9900
703
+ },
704
+ {
705
+ "epoch": 2.323420074349442,
706
+ "grad_norm": 0.3719424605369568,
707
+ "learning_rate": 9.322318316767782e-06,
708
+ "loss": 0.5326,
709
+ "step": 10000
710
+ },
711
+ {
712
+ "epoch": 2.346654275092937,
713
+ "grad_norm": 0.22268572449684143,
714
+ "learning_rate": 9.193236091390217e-06,
715
+ "loss": 0.5379,
716
+ "step": 10100
717
+ },
718
+ {
719
+ "epoch": 2.369888475836431,
720
+ "grad_norm": 0.3181590735912323,
721
+ "learning_rate": 9.06415386601265e-06,
722
+ "loss": 0.5328,
723
+ "step": 10200
724
+ },
725
+ {
726
+ "epoch": 2.393122676579926,
727
+ "grad_norm": 0.2703763246536255,
728
+ "learning_rate": 8.935071640635087e-06,
729
+ "loss": 0.5276,
730
+ "step": 10300
731
+ },
732
+ {
733
+ "epoch": 2.41635687732342,
734
+ "grad_norm": 0.2698732912540436,
735
+ "learning_rate": 8.80598941525752e-06,
736
+ "loss": 0.5338,
737
+ "step": 10400
738
+ },
739
+ {
740
+ "epoch": 2.4395910780669143,
741
+ "grad_norm": 0.2765790820121765,
742
+ "learning_rate": 8.676907189879954e-06,
743
+ "loss": 0.5418,
744
+ "step": 10500
745
+ },
746
+ {
747
+ "epoch": 2.462825278810409,
748
+ "grad_norm": 0.36516493558883667,
749
+ "learning_rate": 8.547824964502389e-06,
750
+ "loss": 0.5249,
751
+ "step": 10600
752
+ },
753
+ {
754
+ "epoch": 2.4860594795539033,
755
+ "grad_norm": 0.23371903598308563,
756
+ "learning_rate": 8.418742739124824e-06,
757
+ "loss": 0.5318,
758
+ "step": 10700
759
+ },
760
+ {
761
+ "epoch": 2.5092936802973975,
762
+ "grad_norm": 0.23883387446403503,
763
+ "learning_rate": 8.289660513747258e-06,
764
+ "loss": 0.5336,
765
+ "step": 10800
766
+ },
767
+ {
768
+ "epoch": 2.532527881040892,
769
+ "grad_norm": 0.23600026965141296,
770
+ "learning_rate": 8.160578288369693e-06,
771
+ "loss": 0.5207,
772
+ "step": 10900
773
+ },
774
+ {
775
+ "epoch": 2.5557620817843865,
776
+ "grad_norm": 0.22283987700939178,
777
+ "learning_rate": 8.031496062992128e-06,
778
+ "loss": 0.5261,
779
+ "step": 11000
780
+ },
781
+ {
782
+ "epoch": 2.578996282527881,
783
+ "grad_norm": 0.3077383041381836,
784
+ "learning_rate": 7.90241383761456e-06,
785
+ "loss": 0.5117,
786
+ "step": 11100
787
+ },
788
+ {
789
+ "epoch": 2.6022304832713754,
790
+ "grad_norm": 0.24372899532318115,
791
+ "learning_rate": 7.773331612236995e-06,
792
+ "loss": 0.5251,
793
+ "step": 11200
794
+ },
795
+ {
796
+ "epoch": 2.6254646840148697,
797
+ "grad_norm": 0.3168962001800537,
798
+ "learning_rate": 7.64424938685943e-06,
799
+ "loss": 0.5238,
800
+ "step": 11300
801
+ },
802
+ {
803
+ "epoch": 2.6486988847583643,
804
+ "grad_norm": 0.2522094249725342,
805
+ "learning_rate": 7.515167161481865e-06,
806
+ "loss": 0.5141,
807
+ "step": 11400
808
+ },
809
+ {
810
+ "epoch": 2.6719330855018586,
811
+ "grad_norm": 0.4139024317264557,
812
+ "learning_rate": 7.3860849361042984e-06,
813
+ "loss": 0.5185,
814
+ "step": 11500
815
+ },
816
+ {
817
+ "epoch": 2.6951672862453533,
818
+ "grad_norm": 0.2781153619289398,
819
+ "learning_rate": 7.257002710726734e-06,
820
+ "loss": 0.5121,
821
+ "step": 11600
822
+ },
823
+ {
824
+ "epoch": 2.7184014869888475,
825
+ "grad_norm": 0.38515913486480713,
826
+ "learning_rate": 7.127920485349168e-06,
827
+ "loss": 0.5178,
828
+ "step": 11700
829
+ },
830
+ {
831
+ "epoch": 2.741635687732342,
832
+ "grad_norm": 0.33289971947669983,
833
+ "learning_rate": 6.998838259971602e-06,
834
+ "loss": 0.5124,
835
+ "step": 11800
836
+ },
837
+ {
838
+ "epoch": 2.7648698884758365,
839
+ "grad_norm": 0.36876046657562256,
840
+ "learning_rate": 6.871046856847813e-06,
841
+ "loss": 0.5137,
842
+ "step": 11900
843
+ },
844
+ {
845
+ "epoch": 2.7881040892193307,
846
+ "grad_norm": 0.28098130226135254,
847
+ "learning_rate": 6.7419646314702466e-06,
848
+ "loss": 0.509,
849
+ "step": 12000
850
+ },
851
+ {
852
+ "epoch": 2.8113382899628254,
853
+ "grad_norm": 0.32521939277648926,
854
+ "learning_rate": 6.612882406092681e-06,
855
+ "loss": 0.512,
856
+ "step": 12100
857
+ },
858
+ {
859
+ "epoch": 2.8345724907063197,
860
+ "grad_norm": 0.23627902567386627,
861
+ "learning_rate": 6.483800180715116e-06,
862
+ "loss": 0.5084,
863
+ "step": 12200
864
+ },
865
+ {
866
+ "epoch": 2.857806691449814,
867
+ "grad_norm": 0.23111554980278015,
868
+ "learning_rate": 6.354717955337551e-06,
869
+ "loss": 0.517,
870
+ "step": 12300
871
+ },
872
+ {
873
+ "epoch": 2.8810408921933086,
874
+ "grad_norm": 0.3062553107738495,
875
+ "learning_rate": 6.2256357299599844e-06,
876
+ "loss": 0.5063,
877
+ "step": 12400
878
+ },
879
+ {
880
+ "epoch": 2.904275092936803,
881
+ "grad_norm": 0.3274383842945099,
882
+ "learning_rate": 6.09655350458242e-06,
883
+ "loss": 0.5066,
884
+ "step": 12500
885
+ },
886
+ {
887
+ "epoch": 2.9275092936802976,
888
+ "grad_norm": 0.25803956389427185,
889
+ "learning_rate": 5.967471279204854e-06,
890
+ "loss": 0.5064,
891
+ "step": 12600
892
+ },
893
+ {
894
+ "epoch": 2.950743494423792,
895
+ "grad_norm": 0.29026666283607483,
896
+ "learning_rate": 5.838389053827288e-06,
897
+ "loss": 0.5088,
898
+ "step": 12700
899
+ },
900
+ {
901
+ "epoch": 2.973977695167286,
902
+ "grad_norm": 0.36228805780410767,
903
+ "learning_rate": 5.709306828449723e-06,
904
+ "loss": 0.507,
905
+ "step": 12800
906
+ },
907
+ {
908
+ "epoch": 2.9972118959107807,
909
+ "grad_norm": 0.2669726014137268,
910
+ "learning_rate": 5.580224603072157e-06,
911
+ "loss": 0.4934,
912
+ "step": 12900
913
+ },
914
+ {
915
+ "epoch": 3.020446096654275,
916
+ "grad_norm": 0.24396216869354248,
917
+ "learning_rate": 5.451142377694592e-06,
918
+ "loss": 0.5099,
919
+ "step": 13000
920
+ },
921
+ {
922
+ "epoch": 3.0436802973977697,
923
+ "grad_norm": 0.25540581345558167,
924
+ "learning_rate": 5.322060152317027e-06,
925
+ "loss": 0.5037,
926
+ "step": 13100
927
+ },
928
+ {
929
+ "epoch": 3.066914498141264,
930
+ "grad_norm": 0.1964583396911621,
931
+ "learning_rate": 5.192977926939461e-06,
932
+ "loss": 0.5055,
933
+ "step": 13200
934
+ },
935
+ {
936
+ "epoch": 3.090148698884758,
937
+ "grad_norm": 0.2318154275417328,
938
+ "learning_rate": 5.063895701561895e-06,
939
+ "loss": 0.5041,
940
+ "step": 13300
941
+ },
942
+ {
943
+ "epoch": 3.113382899628253,
944
+ "grad_norm": 0.28110265731811523,
945
+ "learning_rate": 4.9348134761843295e-06,
946
+ "loss": 0.5043,
947
+ "step": 13400
948
+ },
949
+ {
950
+ "epoch": 3.136617100371747,
951
+ "grad_norm": 0.3360753357410431,
952
+ "learning_rate": 4.805731250806764e-06,
953
+ "loss": 0.4915,
954
+ "step": 13500
955
+ },
956
+ {
957
+ "epoch": 3.159851301115242,
958
+ "grad_norm": 0.3044135868549347,
959
+ "learning_rate": 4.676649025429199e-06,
960
+ "loss": 0.499,
961
+ "step": 13600
962
+ },
963
+ {
964
+ "epoch": 3.183085501858736,
965
+ "grad_norm": 0.28163620829582214,
966
+ "learning_rate": 4.547566800051634e-06,
967
+ "loss": 0.4996,
968
+ "step": 13700
969
+ },
970
+ {
971
+ "epoch": 3.2063197026022303,
972
+ "grad_norm": 0.23853909969329834,
973
+ "learning_rate": 4.418484574674068e-06,
974
+ "loss": 0.5073,
975
+ "step": 13800
976
+ },
977
+ {
978
+ "epoch": 3.229553903345725,
979
+ "grad_norm": 0.25510174036026,
980
+ "learning_rate": 4.289402349296502e-06,
981
+ "loss": 0.4988,
982
+ "step": 13900
983
+ },
984
+ {
985
+ "epoch": 3.2527881040892193,
986
+ "grad_norm": 0.650174081325531,
987
+ "learning_rate": 4.160320123918937e-06,
988
+ "loss": 0.5024,
989
+ "step": 14000
990
+ },
991
+ {
992
+ "epoch": 3.276022304832714,
993
+ "grad_norm": 0.36293137073516846,
994
+ "learning_rate": 4.0312378985413715e-06,
995
+ "loss": 0.4913,
996
+ "step": 14100
997
+ },
998
+ {
999
+ "epoch": 3.299256505576208,
1000
+ "grad_norm": 0.35399818420410156,
1001
+ "learning_rate": 3.902155673163805e-06,
1002
+ "loss": 0.4993,
1003
+ "step": 14200
1004
+ },
1005
+ {
1006
+ "epoch": 3.3224907063197024,
1007
+ "grad_norm": 0.2553289830684662,
1008
+ "learning_rate": 3.7730734477862404e-06,
1009
+ "loss": 0.5017,
1010
+ "step": 14300
1011
+ },
1012
+ {
1013
+ "epoch": 3.345724907063197,
1014
+ "grad_norm": 0.25535061955451965,
1015
+ "learning_rate": 3.643991222408675e-06,
1016
+ "loss": 0.4895,
1017
+ "step": 14400
1018
+ },
1019
+ {
1020
+ "epoch": 3.3689591078066914,
1021
+ "grad_norm": 0.2772742509841919,
1022
+ "learning_rate": 3.514908997031109e-06,
1023
+ "loss": 0.4954,
1024
+ "step": 14500
1025
+ },
1026
+ {
1027
+ "epoch": 3.392193308550186,
1028
+ "grad_norm": 0.26105812191963196,
1029
+ "learning_rate": 3.387117593907319e-06,
1030
+ "loss": 0.4964,
1031
+ "step": 14600
1032
+ },
1033
+ {
1034
+ "epoch": 3.4154275092936803,
1035
+ "grad_norm": 0.2538992166519165,
1036
+ "learning_rate": 3.258035368529754e-06,
1037
+ "loss": 0.4985,
1038
+ "step": 14700
1039
+ },
1040
+ {
1041
+ "epoch": 3.4386617100371746,
1042
+ "grad_norm": 0.2889178693294525,
1043
+ "learning_rate": 3.128953143152188e-06,
1044
+ "loss": 0.4969,
1045
+ "step": 14800
1046
+ },
1047
+ {
1048
+ "epoch": 3.4618959107806693,
1049
+ "grad_norm": 0.28792130947113037,
1050
+ "learning_rate": 2.9998709177746228e-06,
1051
+ "loss": 0.4985,
1052
+ "step": 14900
1053
+ },
1054
+ {
1055
+ "epoch": 3.4851301115241635,
1056
+ "grad_norm": 0.36826494336128235,
1057
+ "learning_rate": 2.8707886923970575e-06,
1058
+ "loss": 0.4937,
1059
+ "step": 15000
1060
+ },
1061
+ {
1062
+ "epoch": 3.508364312267658,
1063
+ "grad_norm": 0.24432937800884247,
1064
+ "learning_rate": 2.7417064670194917e-06,
1065
+ "loss": 0.4892,
1066
+ "step": 15100
1067
+ },
1068
+ {
1069
+ "epoch": 3.5315985130111525,
1070
+ "grad_norm": 0.36436623334884644,
1071
+ "learning_rate": 2.6126242416419264e-06,
1072
+ "loss": 0.5029,
1073
+ "step": 15200
1074
+ },
1075
+ {
1076
+ "epoch": 3.5548327137546467,
1077
+ "grad_norm": 0.3257830739021301,
1078
+ "learning_rate": 2.4835420162643606e-06,
1079
+ "loss": 0.484,
1080
+ "step": 15300
1081
+ },
1082
+ {
1083
+ "epoch": 3.5780669144981414,
1084
+ "grad_norm": 0.20910651981830597,
1085
+ "learning_rate": 2.354459790886795e-06,
1086
+ "loss": 0.4934,
1087
+ "step": 15400
1088
+ },
1089
+ {
1090
+ "epoch": 3.6013011152416357,
1091
+ "grad_norm": 0.27706313133239746,
1092
+ "learning_rate": 2.2253775655092296e-06,
1093
+ "loss": 0.4972,
1094
+ "step": 15500
1095
+ },
1096
+ {
1097
+ "epoch": 3.6245353159851303,
1098
+ "grad_norm": 0.28043028712272644,
1099
+ "learning_rate": 2.0962953401316643e-06,
1100
+ "loss": 0.4878,
1101
+ "step": 15600
1102
+ },
1103
+ {
1104
+ "epoch": 3.6477695167286246,
1105
+ "grad_norm": 0.34835153818130493,
1106
+ "learning_rate": 1.9672131147540985e-06,
1107
+ "loss": 0.4954,
1108
+ "step": 15700
1109
+ },
1110
+ {
1111
+ "epoch": 3.671003717472119,
1112
+ "grad_norm": 0.3561202585697174,
1113
+ "learning_rate": 1.838130889376533e-06,
1114
+ "loss": 0.4992,
1115
+ "step": 15800
1116
+ },
1117
+ {
1118
+ "epoch": 3.6942379182156135,
1119
+ "grad_norm": 0.2767621576786041,
1120
+ "learning_rate": 1.7090486639989677e-06,
1121
+ "loss": 0.4982,
1122
+ "step": 15900
1123
+ },
1124
+ {
1125
+ "epoch": 3.717472118959108,
1126
+ "grad_norm": 0.22851090133190155,
1127
+ "learning_rate": 1.579966438621402e-06,
1128
+ "loss": 0.498,
1129
+ "step": 16000
1130
+ },
1131
+ {
1132
+ "epoch": 3.7407063197026025,
1133
+ "grad_norm": 0.28282201290130615,
1134
+ "learning_rate": 1.4508842132438364e-06,
1135
+ "loss": 0.4898,
1136
+ "step": 16100
1137
+ },
1138
+ {
1139
+ "epoch": 3.7639405204460967,
1140
+ "grad_norm": 0.24474182724952698,
1141
+ "learning_rate": 1.3218019878662709e-06,
1142
+ "loss": 0.501,
1143
+ "step": 16200
1144
+ },
1145
+ {
1146
+ "epoch": 3.787174721189591,
1147
+ "grad_norm": 0.27427938580513,
1148
+ "learning_rate": 1.1927197624887055e-06,
1149
+ "loss": 0.4966,
1150
+ "step": 16300
1151
+ },
1152
+ {
1153
+ "epoch": 3.8104089219330852,
1154
+ "grad_norm": 0.38391393423080444,
1155
+ "learning_rate": 1.0636375371111398e-06,
1156
+ "loss": 0.4941,
1157
+ "step": 16400
1158
+ },
1159
+ {
1160
+ "epoch": 3.83364312267658,
1161
+ "grad_norm": 0.3098974823951721,
1162
+ "learning_rate": 9.345553117335744e-07,
1163
+ "loss": 0.4879,
1164
+ "step": 16500
1165
+ },
1166
+ {
1167
+ "epoch": 3.8568773234200746,
1168
+ "grad_norm": 0.2817577123641968,
1169
+ "learning_rate": 8.054730863560088e-07,
1170
+ "loss": 0.4925,
1171
+ "step": 16600
1172
+ },
1173
+ {
1174
+ "epoch": 3.880111524163569,
1175
+ "grad_norm": 0.3037372827529907,
1176
+ "learning_rate": 6.763908609784433e-07,
1177
+ "loss": 0.4927,
1178
+ "step": 16700
1179
+ },
1180
+ {
1181
+ "epoch": 3.903345724907063,
1182
+ "grad_norm": 0.2850995659828186,
1183
+ "learning_rate": 5.473086356008779e-07,
1184
+ "loss": 0.4909,
1185
+ "step": 16800
1186
+ },
1187
+ {
1188
+ "epoch": 3.9265799256505574,
1189
+ "grad_norm": 0.25115731358528137,
1190
+ "learning_rate": 4.182264102233123e-07,
1191
+ "loss": 0.5,
1192
+ "step": 16900
1193
+ },
1194
+ {
1195
+ "epoch": 3.949814126394052,
1196
+ "grad_norm": 0.4323899745941162,
1197
+ "learning_rate": 2.8914418484574677e-07,
1198
+ "loss": 0.4861,
1199
+ "step": 17000
1200
+ },
1201
+ {
1202
+ "epoch": 3.9730483271375467,
1203
+ "grad_norm": 0.30076873302459717,
1204
+ "learning_rate": 1.6006195946818127e-07,
1205
+ "loss": 0.4855,
1206
+ "step": 17100
1207
+ },
1208
+ {
1209
+ "epoch": 3.996282527881041,
1210
+ "grad_norm": 0.2874129116535187,
1211
+ "learning_rate": 3.097973409061573e-08,
1212
+ "loss": 0.4957,
1213
+ "step": 17200
1214
  }
1215
  ],
1216
  "logging_steps": 100,
1217
+ "max_steps": 17216,
1218
  "num_input_tokens_seen": 0,
1219
  "num_train_epochs": 4,
1220
  "save_steps": 100,
 
1230
  "attributes": {}
1231
  }
1232
  },
1233
+ "total_flos": 3.805111076121907e+16,
1234
  "train_batch_size": 4,
1235
  "trial_name": null,
1236
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f25f4c63676c8f67c641355c20226b66c56f1324579ddcaea18b93da2d7de52
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:11a7f882a57ed2395a4921dd13204e513e9cf0ef0b5aff13aea0e0bf009fa0ce
3
  size 5432