furmaniak committed on
Commit
dde5ed9
·
verified ·
1 Parent(s): f87c629

End of training

Browse files
Files changed (5) hide show
  1. README.md +2 -1
  2. all_results.json +6 -6
  3. train_results.json +6 -6
  4. trainer_state.json +484 -785
  5. training_loss.png +0 -0
README.md CHANGED
@@ -4,6 +4,7 @@ license: apache-2.0
4
  base_model: Qwen/Qwen2.5-32B
5
  tags:
6
  - llama-factory
 
7
  - generated_from_trainer
8
  model-index:
9
  - name: pretrain
@@ -15,7 +16,7 @@ should probably proofread and complete it, then remove this comment. -->
15
 
16
  # pretrain
17
 
18
- This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on an unknown dataset.
19
 
20
  ## Model description
21
 
 
4
  base_model: Qwen/Qwen2.5-32B
5
  tags:
6
  - llama-factory
7
+ - full
8
  - generated_from_trainer
9
  model-index:
10
  - name: pretrain
 
16
 
17
  # pretrain
18
 
19
+ This model is a fine-tuned version of [Qwen/Qwen2.5-32B](https://huggingface.co/Qwen/Qwen2.5-32B) on the openalex dataset.
20
 
21
  ## Model description
22
 
all_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9949787562765546,
3
- "total_flos": 1132817220108288.0,
4
- "train_loss": 0.578794286858221,
5
- "train_runtime": 20320.7913,
6
- "train_samples_per_second": 1.019,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
+ "epoch": 0.9952556668423828,
3
+ "total_flos": 1660937136242688.0,
4
+ "train_loss": 1.571211524939133,
5
+ "train_runtime": 47361.2024,
6
+ "train_samples_per_second": 0.641,
7
+ "train_steps_per_second": 0.002
8
  }
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.9949787562765546,
3
- "total_flos": 1132817220108288.0,
4
- "train_loss": 0.578794286858221,
5
- "train_runtime": 20320.7913,
6
- "train_samples_per_second": 1.019,
7
- "train_steps_per_second": 0.008
8
  }
 
1
  {
2
+ "epoch": 0.9952556668423828,
3
+ "total_flos": 1660937136242688.0,
4
+ "train_loss": 1.571211524939133,
5
+ "train_runtime": 47361.2024,
6
+ "train_samples_per_second": 0.641,
7
+ "train_steps_per_second": 0.002
8
  }
trainer_state.json CHANGED
@@ -1,1152 +1,851 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.9949787562765546,
5
  "eval_steps": 500,
6
- "global_step": 161,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.013114754098360656,
13
- "grad_norm": 0.043162938207387924,
14
- "learning_rate": 1.25e-05,
15
- "loss": 1.2441,
16
  "step": 1
17
  },
18
  {
19
- "epoch": 0.02622950819672131,
20
- "grad_norm": 0.043701257556676865,
21
- "learning_rate": 2.5e-05,
22
- "loss": 1.2477,
23
  "step": 2
24
  },
25
  {
26
- "epoch": 0.03934426229508197,
27
- "grad_norm": 0.027155233547091484,
28
- "learning_rate": 3.7500000000000003e-05,
29
- "loss": 1.2284,
30
  "step": 3
31
  },
32
  {
33
- "epoch": 0.05245901639344262,
34
- "grad_norm": 0.10691457986831665,
35
- "learning_rate": 5e-05,
36
- "loss": 1.2441,
37
  "step": 4
38
  },
39
  {
40
- "epoch": 0.06557377049180328,
41
- "grad_norm": 0.09258124232292175,
42
- "learning_rate": 6.25e-05,
43
- "loss": 1.2181,
44
  "step": 5
45
  },
46
  {
47
- "epoch": 0.07868852459016394,
48
- "grad_norm": 0.06767486780881882,
49
- "learning_rate": 7.500000000000001e-05,
50
- "loss": 1.2279,
51
  "step": 6
52
  },
53
  {
54
- "epoch": 0.09180327868852459,
55
- "grad_norm": 0.1602831929922104,
56
- "learning_rate": 8.75e-05,
57
- "loss": 1.2553,
58
  "step": 7
59
  },
60
  {
61
- "epoch": 0.10491803278688525,
62
- "grad_norm": 0.2144620418548584,
63
- "learning_rate": 0.0001,
64
- "loss": 1.2558,
65
  "step": 8
66
  },
67
  {
68
- "epoch": 0.1180327868852459,
69
- "grad_norm": 0.06662847101688385,
70
- "learning_rate": 9.994664874011863e-05,
71
- "loss": 1.2333,
72
  "step": 9
73
  },
74
  {
75
- "epoch": 0.13114754098360656,
76
- "grad_norm": 0.13297972083091736,
77
- "learning_rate": 9.978670881475172e-05,
78
- "loss": 1.2274,
79
  "step": 10
80
  },
81
  {
82
- "epoch": 0.14426229508196722,
83
- "grad_norm": 0.06226632371544838,
84
- "learning_rate": 9.952052154376026e-05,
85
- "loss": 1.2171,
86
  "step": 11
87
  },
88
  {
89
- "epoch": 0.15737704918032788,
90
- "grad_norm": 0.05960860103368759,
91
- "learning_rate": 9.91486549841951e-05,
92
- "loss": 1.2004,
93
  "step": 12
94
  },
95
  {
96
- "epoch": 0.17049180327868851,
97
- "grad_norm": 0.057807717472314835,
98
- "learning_rate": 9.867190271803465e-05,
99
- "loss": 1.1961,
100
  "step": 13
101
  },
102
  {
103
- "epoch": 0.18360655737704917,
104
- "grad_norm": 0.044687915593385696,
105
- "learning_rate": 9.809128215864097e-05,
106
- "loss": 1.2057,
107
  "step": 14
108
  },
109
  {
110
- "epoch": 0.19672131147540983,
111
- "grad_norm": 0.042382605373859406,
112
- "learning_rate": 9.74080323795483e-05,
113
- "loss": 1.2015,
114
  "step": 15
115
  },
116
  {
117
- "epoch": 0.2098360655737705,
118
- "grad_norm": 0.041327111423015594,
119
- "learning_rate": 9.662361147021779e-05,
120
- "loss": 1.1922,
121
  "step": 16
122
  },
123
  {
124
- "epoch": 0.22295081967213115,
125
- "grad_norm": 0.03643479198217392,
126
- "learning_rate": 9.573969342440106e-05,
127
- "loss": 1.1801,
128
  "step": 17
129
  },
130
  {
131
- "epoch": 0.2360655737704918,
132
- "grad_norm": 0.03201618418097496,
133
- "learning_rate": 9.475816456775313e-05,
134
- "loss": 1.1793,
135
  "step": 18
136
  },
137
  {
138
- "epoch": 0.24918032786885247,
139
- "grad_norm": 0.033190254122018814,
140
- "learning_rate": 9.368111953231848e-05,
141
- "loss": 1.1727,
142
  "step": 19
143
  },
144
  {
145
- "epoch": 0.26229508196721313,
146
- "grad_norm": 0.029199425131082535,
147
- "learning_rate": 9.251085678648072e-05,
148
- "loss": 1.1803,
149
  "step": 20
150
  },
151
  {
152
- "epoch": 0.2754098360655738,
153
- "grad_norm": 0.019889283925294876,
154
- "learning_rate": 9.124987372991511e-05,
155
- "loss": 1.1739,
156
  "step": 21
157
  },
158
  {
159
- "epoch": 0.28852459016393445,
160
- "grad_norm": 0.025394951924681664,
161
- "learning_rate": 8.9900861364012e-05,
162
- "loss": 1.1824,
163
  "step": 22
164
  },
165
  {
166
- "epoch": 0.3016393442622951,
167
- "grad_norm": 0.02561134099960327,
168
- "learning_rate": 8.846669854914396e-05,
169
- "loss": 1.1704,
170
  "step": 23
171
  },
172
  {
173
- "epoch": 0.31475409836065577,
174
- "grad_norm": 0.020040003582835197,
175
- "learning_rate": 8.695044586103296e-05,
176
- "loss": 1.1748,
177
  "step": 24
178
  },
179
  {
180
- "epoch": 0.32786885245901637,
181
- "grad_norm": 0.0246971994638443,
182
- "learning_rate": 8.535533905932738e-05,
183
- "loss": 1.1726,
184
  "step": 25
185
  },
186
  {
187
- "epoch": 0.34098360655737703,
188
- "grad_norm": 0.019360244274139404,
189
- "learning_rate": 8.368478218232787e-05,
190
- "loss": 1.1539,
191
  "step": 26
192
  },
193
  {
194
- "epoch": 0.3540983606557377,
195
- "grad_norm": 0.021348467096686363,
196
- "learning_rate": 8.194234028259806e-05,
197
- "loss": 1.1641,
198
  "step": 27
199
  },
200
  {
201
- "epoch": 0.36721311475409835,
202
- "grad_norm": 0.01926092617213726,
203
- "learning_rate": 8.013173181896283e-05,
204
- "loss": 1.1548,
205
  "step": 28
206
  },
207
  {
208
- "epoch": 0.380327868852459,
209
- "grad_norm": 0.018094880506396294,
210
- "learning_rate": 7.82568207211296e-05,
211
- "loss": 1.1595,
212
  "step": 29
213
  },
214
  {
215
- "epoch": 0.39344262295081966,
216
- "grad_norm": 0.019909674301743507,
217
- "learning_rate": 7.63216081438678e-05,
218
- "loss": 1.1488,
219
  "step": 30
220
  },
221
  {
222
- "epoch": 0.4065573770491803,
223
- "grad_norm": 0.015068020671606064,
224
- "learning_rate": 7.433022392834282e-05,
225
- "loss": 1.1518,
226
  "step": 31
227
  },
228
  {
229
- "epoch": 0.419672131147541,
230
- "grad_norm": 0.019493145868182182,
231
- "learning_rate": 7.228691778882693e-05,
232
- "loss": 1.1643,
233
  "step": 32
234
  },
235
  {
236
- "epoch": 0.43278688524590164,
237
- "grad_norm": 0.018127303570508957,
238
- "learning_rate": 7.019605024359474e-05,
239
- "loss": 1.1449,
240
  "step": 33
241
  },
242
  {
243
- "epoch": 0.4459016393442623,
244
- "grad_norm": 0.015173117630183697,
245
- "learning_rate": 6.806208330935766e-05,
246
- "loss": 1.1632,
247
  "step": 34
248
  },
249
  {
250
- "epoch": 0.45901639344262296,
251
- "grad_norm": 0.01737191341817379,
252
- "learning_rate": 6.588957097909508e-05,
253
- "loss": 1.1618,
254
  "step": 35
255
  },
256
  {
257
- "epoch": 0.4721311475409836,
258
- "grad_norm": 0.015316477976739407,
259
- "learning_rate": 6.368314950360415e-05,
260
- "loss": 1.1445,
261
  "step": 36
262
  },
263
  {
264
- "epoch": 0.4852459016393443,
265
- "grad_norm": 0.015012883581221104,
266
- "learning_rate": 6.14475274975067e-05,
267
- "loss": 1.1558,
268
  "step": 37
269
  },
270
  {
271
- "epoch": 0.49836065573770494,
272
- "grad_norm": 0.015383531339466572,
273
- "learning_rate": 5.918747589082853e-05,
274
- "loss": 1.1331,
275
  "step": 38
276
  },
277
  {
278
- "epoch": 0.5114754098360655,
279
- "grad_norm": 0.0135785061866045,
280
- "learning_rate": 5.6907817747594116e-05,
281
- "loss": 1.1423,
282
  "step": 39
283
  },
284
  {
285
- "epoch": 0.5245901639344263,
286
- "grad_norm": 0.014159608632326126,
287
- "learning_rate": 5.4613417973165106e-05,
288
- "loss": 1.1343,
289
  "step": 40
290
  },
291
  {
292
- "epoch": 0.5377049180327869,
293
- "grad_norm": 0.014230456203222275,
294
- "learning_rate": 5.230917293228699e-05,
295
- "loss": 1.1344,
296
  "step": 41
297
  },
298
  {
299
- "epoch": 0.5508196721311476,
300
- "grad_norm": 0.014486027881503105,
301
- "learning_rate": 5e-05,
302
- "loss": 1.1298,
303
  "step": 42
304
  },
305
  {
306
- "epoch": 0.5639344262295082,
307
- "grad_norm": 0.013846023939549923,
308
- "learning_rate": 4.7690827067713035e-05,
309
- "loss": 1.144,
310
  "step": 43
311
  },
312
  {
313
- "epoch": 0.5770491803278689,
314
- "grad_norm": 0.012692565098404884,
315
- "learning_rate": 4.5386582026834906e-05,
316
- "loss": 1.1317,
317
  "step": 44
318
  },
319
  {
320
- "epoch": 0.5901639344262295,
321
- "grad_norm": 0.012511651031672955,
322
- "learning_rate": 4.30921822524059e-05,
323
- "loss": 1.1495,
324
  "step": 45
325
  },
326
  {
327
- "epoch": 0.6032786885245902,
328
- "grad_norm": 0.012697260826826096,
329
- "learning_rate": 4.0812524109171476e-05,
330
- "loss": 1.1365,
331
  "step": 46
332
  },
333
  {
334
- "epoch": 0.6163934426229508,
335
- "grad_norm": 0.013609658926725388,
336
- "learning_rate": 3.855247250249331e-05,
337
- "loss": 1.1332,
338
  "step": 47
339
  },
340
  {
341
- "epoch": 0.6295081967213115,
342
- "grad_norm": 0.011829257011413574,
343
- "learning_rate": 3.631685049639586e-05,
344
- "loss": 1.1266,
345
  "step": 48
346
  },
347
  {
348
- "epoch": 0.6426229508196721,
349
- "grad_norm": 0.011731350794434547,
350
- "learning_rate": 3.411042902090492e-05,
351
- "loss": 1.1325,
352
  "step": 49
353
  },
354
  {
355
- "epoch": 0.6557377049180327,
356
- "grad_norm": 0.011319032870233059,
357
- "learning_rate": 3.1937916690642356e-05,
358
- "loss": 1.1227,
359
  "step": 50
360
  },
361
  {
362
- "epoch": 0.6688524590163935,
363
- "grad_norm": 0.012229708954691887,
364
- "learning_rate": 2.980394975640526e-05,
365
- "loss": 1.1338,
366
  "step": 51
367
  },
368
  {
369
- "epoch": 0.6819672131147541,
370
- "grad_norm": 0.010175776667892933,
371
- "learning_rate": 2.771308221117309e-05,
372
- "loss": 1.1144,
373
  "step": 52
374
  },
375
  {
376
- "epoch": 0.6950819672131148,
377
- "grad_norm": 0.01121637411415577,
378
- "learning_rate": 2.5669776071657192e-05,
379
- "loss": 1.1253,
380
  "step": 53
381
  },
382
  {
383
- "epoch": 0.7081967213114754,
384
- "grad_norm": 0.010852695442736149,
385
- "learning_rate": 2.3678391856132204e-05,
386
- "loss": 1.1344,
387
  "step": 54
388
  },
389
  {
390
- "epoch": 0.7213114754098361,
391
- "grad_norm": 0.010888871736824512,
392
- "learning_rate": 2.1743179278870407e-05,
393
- "loss": 1.1342,
394
  "step": 55
395
  },
396
  {
397
- "epoch": 0.7344262295081967,
398
- "grad_norm": 0.009675499983131886,
399
- "learning_rate": 1.9868268181037185e-05,
400
- "loss": 1.1233,
401
  "step": 56
402
  },
403
  {
404
- "epoch": 0.7475409836065574,
405
- "grad_norm": 0.0103612020611763,
406
- "learning_rate": 1.8057659717401947e-05,
407
- "loss": 1.128,
408
  "step": 57
409
  },
410
  {
411
- "epoch": 0.760655737704918,
412
- "grad_norm": 0.011313353665173054,
413
- "learning_rate": 1.631521781767214e-05,
414
- "loss": 1.1127,
415
  "step": 58
416
  },
417
  {
418
- "epoch": 0.7737704918032787,
419
- "grad_norm": 0.009756634011864662,
420
- "learning_rate": 1.4644660940672627e-05,
421
- "loss": 1.1207,
422
  "step": 59
423
  },
424
  {
425
- "epoch": 0.7868852459016393,
426
- "grad_norm": 0.009867743588984013,
427
- "learning_rate": 1.3049554138967051e-05,
428
- "loss": 1.1154,
429
  "step": 60
430
  },
431
  {
432
- "epoch": 0.8,
433
- "grad_norm": 0.00982784666121006,
434
- "learning_rate": 1.1533301450856054e-05,
435
- "loss": 1.1272,
436
  "step": 61
437
  },
438
  {
439
- "epoch": 0.8131147540983606,
440
- "grad_norm": 0.00951016042381525,
441
- "learning_rate": 1.0099138635988026e-05,
442
- "loss": 1.1139,
443
  "step": 62
444
  },
445
  {
446
- "epoch": 0.8262295081967214,
447
- "grad_norm": 0.010089361108839512,
448
- "learning_rate": 8.75012627008489e-06,
449
- "loss": 1.1304,
450
  "step": 63
451
  },
452
  {
453
- "epoch": 0.839344262295082,
454
- "grad_norm": 0.010168294422328472,
455
- "learning_rate": 7.489143213519301e-06,
456
- "loss": 1.109,
457
  "step": 64
458
  },
459
  {
460
- "epoch": 0.8524590163934426,
461
- "grad_norm": 0.010138073936104774,
462
- "learning_rate": 6.318880467681526e-06,
463
- "loss": 1.1259,
464
  "step": 65
465
  },
466
  {
467
- "epoch": 0.8655737704918033,
468
- "grad_norm": 0.009661810472607613,
469
- "learning_rate": 5.241835432246889e-06,
470
- "loss": 1.1184,
471
  "step": 66
472
  },
473
  {
474
- "epoch": 0.8786885245901639,
475
- "grad_norm": 0.009376097470521927,
476
- "learning_rate": 4.260306575598949e-06,
477
- "loss": 1.1214,
478
  "step": 67
479
  },
480
  {
481
- "epoch": 0.8918032786885246,
482
- "grad_norm": 0.009214168414473534,
483
- "learning_rate": 3.376388529782215e-06,
484
- "loss": 1.109,
485
  "step": 68
486
  },
487
  {
488
- "epoch": 0.9049180327868852,
489
- "grad_norm": 0.009044879116117954,
490
- "learning_rate": 2.591967620451707e-06,
491
- "loss": 1.1116,
492
  "step": 69
493
  },
494
  {
495
- "epoch": 0.9180327868852459,
496
- "grad_norm": 0.009314059279859066,
497
- "learning_rate": 1.908717841359048e-06,
498
- "loss": 1.1278,
499
  "step": 70
500
  },
501
  {
502
- "epoch": 0.9311475409836065,
503
- "grad_norm": 0.009015677496790886,
504
- "learning_rate": 1.328097281965357e-06,
505
- "loss": 1.1211,
506
  "step": 71
507
  },
508
  {
509
- "epoch": 0.9442622950819672,
510
- "grad_norm": 0.009113411419093609,
511
- "learning_rate": 8.513450158049108e-07,
512
- "loss": 1.1148,
513
  "step": 72
514
  },
515
  {
516
- "epoch": 0.9573770491803278,
517
- "grad_norm": 0.008978264406323433,
518
- "learning_rate": 4.794784562397458e-07,
519
- "loss": 1.1207,
520
  "step": 73
521
  },
522
  {
523
- "epoch": 0.9704918032786886,
524
- "grad_norm": 0.008966252207756042,
525
- "learning_rate": 2.1329118524827662e-07,
526
- "loss": 1.1231,
527
  "step": 74
528
  },
529
  {
530
- "epoch": 0.9836065573770492,
531
- "grad_norm": 0.00909092091023922,
532
- "learning_rate": 5.3351259881379014e-08,
533
- "loss": 1.1078,
534
  "step": 75
535
  },
536
  {
537
- "epoch": 0.9967213114754099,
538
- "grad_norm": 0.00918254442512989,
539
- "learning_rate": 0.0,
540
- "loss": 1.1241,
541
  "step": 76
542
  },
543
  {
544
- "epoch": 0.47585940517574354,
545
- "grad_norm": 0.00927089061588049,
546
- "learning_rate": 6.294095225512603e-05,
547
- "loss": 1.0998,
548
  "step": 77
549
  },
550
  {
551
- "epoch": 0.4820393974507532,
552
- "grad_norm": 0.009278366342186928,
553
- "learning_rate": 6.188429461630866e-05,
554
- "loss": 1.0809,
555
  "step": 78
556
  },
557
  {
558
- "epoch": 0.48821938972576284,
559
- "grad_norm": 0.009307453408837318,
560
- "learning_rate": 6.0821980696905146e-05,
561
- "loss": 1.1079,
562
  "step": 79
563
  },
564
  {
565
- "epoch": 0.4943993820007725,
566
- "grad_norm": 0.008874714374542236,
567
- "learning_rate": 5.9754516100806423e-05,
568
- "loss": 1.0846,
569
  "step": 80
570
  },
571
  {
572
- "epoch": 0.5005793742757821,
573
- "grad_norm": 0.00868895836174488,
574
- "learning_rate": 5.868240888334653e-05,
575
- "loss": 1.0991,
576
  "step": 81
577
  },
578
  {
579
- "epoch": 0.5067593665507918,
580
- "grad_norm": 0.008949129842221737,
581
- "learning_rate": 5.7606169309495836e-05,
582
- "loss": 1.107,
583
  "step": 82
584
  },
585
  {
586
- "epoch": 0.5129393588258014,
587
- "grad_norm": 0.009207559749484062,
588
- "learning_rate": 5.6526309611002594e-05,
589
- "loss": 1.1034,
590
  "step": 83
591
  },
592
  {
593
- "epoch": 0.5191193511008111,
594
- "grad_norm": 0.009371085092425346,
595
- "learning_rate": 5.544334374259823e-05,
596
- "loss": 1.0936,
597
  "step": 84
598
  },
599
  {
600
- "epoch": 0.5252993433758207,
601
- "grad_norm": 0.009222784079611301,
602
- "learning_rate": 5.435778713738292e-05,
603
- "loss": 1.0909,
604
  "step": 85
605
  },
606
  {
607
- "epoch": 0.5314793356508304,
608
- "grad_norm": 0.00895879790186882,
609
- "learning_rate": 5.327015646150716e-05,
610
- "loss": 1.0871,
611
  "step": 86
612
  },
613
  {
614
- "epoch": 0.5376593279258401,
615
- "grad_norm": 0.008927428163588047,
616
- "learning_rate": 5.218096936826681e-05,
617
- "loss": 1.0917,
618
  "step": 87
619
  },
620
  {
621
- "epoch": 0.5438393202008498,
622
- "grad_norm": 0.00859418697655201,
623
- "learning_rate": 5.1090744251728064e-05,
624
- "loss": 1.1013,
625
  "step": 88
626
  },
627
  {
628
- "epoch": 0.5500193124758594,
629
- "grad_norm": 0.009128894656896591,
630
- "learning_rate": 5e-05,
631
- "loss": 1.0948,
632
  "step": 89
633
  },
634
  {
635
- "epoch": 0.5561993047508691,
636
- "grad_norm": 0.008752775378525257,
637
- "learning_rate": 4.890925574827195e-05,
638
- "loss": 1.103,
639
  "step": 90
640
  },
641
  {
642
- "epoch": 0.5623792970258787,
643
- "grad_norm": 0.009119733236730099,
644
- "learning_rate": 4.781903063173321e-05,
645
- "loss": 1.0858,
646
  "step": 91
647
  },
648
  {
649
- "epoch": 0.5685592893008884,
650
- "grad_norm": 0.009288666769862175,
651
- "learning_rate": 4.6729843538492847e-05,
652
- "loss": 1.0867,
653
  "step": 92
654
  },
655
  {
656
- "epoch": 0.574739281575898,
657
- "grad_norm": 0.0089786471799016,
658
- "learning_rate": 4.564221286261709e-05,
659
- "loss": 1.0861,
660
  "step": 93
661
  },
662
  {
663
- "epoch": 0.5809192738509077,
664
- "grad_norm": 0.008815642446279526,
665
- "learning_rate": 4.4556656257401786e-05,
666
- "loss": 1.0981,
667
  "step": 94
668
  },
669
  {
670
- "epoch": 0.5870992661259173,
671
- "grad_norm": 0.00881979987025261,
672
- "learning_rate": 4.347369038899744e-05,
673
- "loss": 1.1144,
674
  "step": 95
675
  },
676
  {
677
- "epoch": 0.593279258400927,
678
- "grad_norm": 0.009116360917687416,
679
- "learning_rate": 4.239383069050417e-05,
680
- "loss": 1.1074,
681
  "step": 96
682
  },
683
  {
684
- "epoch": 0.5994592506759366,
685
- "grad_norm": 0.008931254036724567,
686
- "learning_rate": 4.131759111665349e-05,
687
- "loss": 1.1069,
688
  "step": 97
689
  },
690
  {
691
- "epoch": 0.6056392429509463,
692
- "grad_norm": 0.00889168307185173,
693
- "learning_rate": 4.0245483899193595e-05,
694
- "loss": 1.1113,
695
  "step": 98
696
  },
697
  {
698
- "epoch": 0.6118192352259559,
699
- "grad_norm": 0.008884157054126263,
700
- "learning_rate": 3.917801930309486e-05,
701
- "loss": 1.0798,
702
  "step": 99
703
  },
704
  {
705
- "epoch": 0.6179992275009656,
706
- "grad_norm": 0.008808060549199581,
707
- "learning_rate": 3.8115705383691355e-05,
708
- "loss": 1.0835,
709
  "step": 100
710
  },
711
  {
712
- "epoch": 0.6241792197759752,
713
- "grad_norm": 0.009600223042070866,
714
- "learning_rate": 3.705904774487396e-05,
715
- "loss": 1.0937,
716
  "step": 101
717
  },
718
  {
719
- "epoch": 0.6303592120509849,
720
- "grad_norm": 0.009098890237510204,
721
- "learning_rate": 3.60085492984504e-05,
722
- "loss": 1.1008,
723
  "step": 102
724
  },
725
  {
726
- "epoch": 0.6365392043259946,
727
- "grad_norm": 0.009177979081869125,
728
- "learning_rate": 3.4964710024786354e-05,
729
- "loss": 1.096,
730
  "step": 103
731
  },
732
  {
733
- "epoch": 0.6427191966010043,
734
- "grad_norm": 0.008857106789946556,
735
- "learning_rate": 3.392802673484193e-05,
736
- "loss": 1.0815,
737
  "step": 104
738
  },
739
  {
740
- "epoch": 0.6488991888760139,
741
- "grad_norm": 0.009007126092910767,
742
- "learning_rate": 3.289899283371657e-05,
743
- "loss": 1.091,
744
  "step": 105
745
  },
746
  {
747
- "epoch": 0.6550791811510236,
748
- "grad_norm": 0.009332729503512383,
749
- "learning_rate": 3.1878098085814924e-05,
750
- "loss": 1.0834,
751
  "step": 106
752
  },
753
  {
754
- "epoch": 0.6612591734260332,
755
- "grad_norm": 0.009046237915754318,
756
- "learning_rate": 3.086582838174551e-05,
757
- "loss": 1.0844,
758
  "step": 107
759
  },
760
  {
761
- "epoch": 0.6674391657010429,
762
- "grad_norm": 0.008926077745854855,
763
- "learning_rate": 2.9862665507063147e-05,
764
- "loss": 1.0649,
765
  "step": 108
766
  },
767
  {
768
- "epoch": 0.6736191579760525,
769
- "grad_norm": 0.00914798304438591,
770
- "learning_rate": 2.886908691296504e-05,
771
- "loss": 1.0912,
772
  "step": 109
773
  },
774
  {
775
- "epoch": 0.6797991502510622,
776
- "grad_norm": 0.008840657770633698,
777
- "learning_rate": 2.7885565489049946e-05,
778
- "loss": 1.085,
779
  "step": 110
780
  },
781
  {
782
- "epoch": 0.6859791425260718,
783
- "grad_norm": 0.009009969420731068,
784
- "learning_rate": 2.6912569338248315e-05,
785
- "loss": 1.0945,
786
  "step": 111
787
  },
788
  {
789
- "epoch": 0.6921591348010815,
790
- "grad_norm": 0.008585930801928043,
791
- "learning_rate": 2.595056155403063e-05,
792
- "loss": 1.0973,
793
  "step": 112
794
  },
795
  {
796
- "epoch": 0.6983391270760911,
797
- "grad_norm": 0.009383322671055794,
798
- "learning_rate": 2.500000000000001e-05,
799
- "loss": 1.0918,
800
  "step": 113
801
  },
802
  {
803
- "epoch": 0.7045191193511008,
804
- "grad_norm": 0.009045167826116085,
805
- "learning_rate": 2.4061337091973918e-05,
806
- "loss": 1.1037,
807
  "step": 114
808
  },
809
  {
810
- "epoch": 0.7106991116261104,
811
- "grad_norm": 0.009319834411144257,
812
- "learning_rate": 2.3135019582658802e-05,
813
- "loss": 1.1059,
814
  "step": 115
815
  },
816
  {
817
- "epoch": 0.7168791039011201,
818
- "grad_norm": 0.008737134747207165,
819
- "learning_rate": 2.2221488349019903e-05,
820
- "loss": 1.0922,
821
  "step": 116
822
  },
823
  {
824
- "epoch": 0.7230590961761297,
825
- "grad_norm": 0.009303976781666279,
826
- "learning_rate": 2.132117818244771e-05,
827
- "loss": 1.0925,
828
  "step": 117
829
  },
830
  {
831
- "epoch": 0.7292390884511394,
832
- "grad_norm": 0.009160283021628857,
833
- "learning_rate": 2.0434517581820896e-05,
834
- "loss": 1.1057,
835
- "step": 118
836
- },
837
- {
838
- "epoch": 0.7354190807261491,
839
- "grad_norm": 0.009896110743284225,
840
- "learning_rate": 1.9561928549563968e-05,
841
- "loss": 1.1058,
842
- "step": 119
843
- },
844
- {
845
- "epoch": 0.7415990730011588,
846
- "grad_norm": 0.008766653947532177,
847
- "learning_rate": 1.8703826390797048e-05,
848
- "loss": 1.0959,
849
- "step": 120
850
- },
851
- {
852
- "epoch": 0.7477790652761684,
853
- "grad_norm": 0.008967863395810127,
854
- "learning_rate": 1.7860619515673033e-05,
855
- "loss": 1.1026,
856
- "step": 121
857
- },
858
- {
859
- "epoch": 0.7539590575511781,
860
- "grad_norm": 0.009152066893875599,
861
- "learning_rate": 1.703270924499656e-05,
862
- "loss": 1.0926,
863
- "step": 122
864
- },
865
- {
866
- "epoch": 0.7601390498261877,
867
- "grad_norm": 0.008764652535319328,
868
- "learning_rate": 1.622048961921699e-05,
869
- "loss": 1.0756,
870
- "step": 123
871
- },
872
- {
873
- "epoch": 0.7663190421011974,
874
- "grad_norm": 0.009184801019728184,
875
- "learning_rate": 1.5424347210886538e-05,
876
- "loss": 1.1013,
877
- "step": 124
878
- },
879
- {
880
- "epoch": 0.772499034376207,
881
- "grad_norm": 0.009281960316002369,
882
- "learning_rate": 1.4644660940672627e-05,
883
- "loss": 1.1041,
884
- "step": 125
885
- },
886
- {
887
- "epoch": 0.7786790266512167,
888
- "grad_norm": 0.009053783491253853,
889
- "learning_rate": 1.3881801897012225e-05,
890
- "loss": 1.1018,
891
- "step": 126
892
- },
893
- {
894
- "epoch": 0.7848590189262263,
895
- "grad_norm": 0.008994681760668755,
896
- "learning_rate": 1.3136133159493802e-05,
897
- "loss": 1.0951,
898
- "step": 127
899
- },
900
- {
901
- "epoch": 0.791039011201236,
902
- "grad_norm": 0.008957086130976677,
903
- "learning_rate": 1.2408009626051137e-05,
904
- "loss": 1.0848,
905
- "step": 128
906
- },
907
- {
908
- "epoch": 0.7972190034762456,
909
- "grad_norm": 0.008901839144527912,
910
- "learning_rate": 1.1697777844051105e-05,
911
- "loss": 1.0876,
912
- "step": 129
913
- },
914
- {
915
- "epoch": 0.8033989957512553,
916
- "grad_norm": 0.009184077382087708,
917
- "learning_rate": 1.100577584535592e-05,
918
- "loss": 1.0919,
919
- "step": 130
920
- },
921
- {
922
- "epoch": 0.8095789880262649,
923
- "grad_norm": 0.008814208209514618,
924
- "learning_rate": 1.0332332985438248e-05,
925
- "loss": 1.092,
926
- "step": 131
927
- },
928
- {
929
- "epoch": 0.8157589803012746,
930
- "grad_norm": 0.009356915950775146,
931
- "learning_rate": 9.677769786625867e-06,
932
- "loss": 1.092,
933
- "step": 132
934
- },
935
- {
936
- "epoch": 0.8219389725762842,
937
- "grad_norm": 0.009066778235137463,
938
- "learning_rate": 9.042397785550405e-06,
939
- "loss": 1.1062,
940
- "step": 133
941
- },
942
- {
943
- "epoch": 0.8281189648512939,
944
- "grad_norm": 0.009054549038410187,
945
- "learning_rate": 8.426519384872733e-06,
946
- "loss": 1.0959,
947
- "step": 134
948
- },
949
- {
950
- "epoch": 0.8342989571263036,
951
- "grad_norm": 0.009238997474312782,
952
- "learning_rate": 7.830427709355725e-06,
953
- "loss": 1.1261,
954
- "step": 135
955
- },
956
- {
957
- "epoch": 0.8404789494013133,
958
- "grad_norm": 0.009531921707093716,
959
- "learning_rate": 7.2544064663526815e-06,
960
- "loss": 1.1119,
961
- "step": 136
962
- },
963
- {
964
- "epoch": 0.8466589416763229,
965
- "grad_norm": 0.008905571885406971,
966
- "learning_rate": 6.698729810778065e-06,
967
- "loss": 1.0965,
968
- "step": 137
969
- },
970
- {
971
- "epoch": 0.8528389339513326,
972
- "grad_norm": 0.008772294037044048,
973
- "learning_rate": 6.163662214624616e-06,
974
- "loss": 1.0972,
975
- "step": 138
976
- },
977
- {
978
- "epoch": 0.8590189262263422,
979
- "grad_norm": 0.008754718117415905,
980
- "learning_rate": 5.649458341088915e-06,
981
- "loss": 1.0918,
982
- "step": 139
983
- },
984
- {
985
- "epoch": 0.8651989185013519,
986
- "grad_norm": 0.008972358889877796,
987
- "learning_rate": 5.156362923365588e-06,
988
- "loss": 1.1049,
989
- "step": 140
990
- },
991
- {
992
- "epoch": 0.8713789107763615,
993
- "grad_norm": 0.00903693214058876,
994
- "learning_rate": 4.684610648167503e-06,
995
- "loss": 1.0926,
996
- "step": 141
997
- },
998
- {
999
- "epoch": 0.8775589030513712,
1000
- "grad_norm": 0.009000574238598347,
1001
- "learning_rate": 4.234426044027645e-06,
1002
- "loss": 1.1078,
1003
- "step": 142
1004
- },
1005
- {
1006
- "epoch": 0.8837388953263808,
1007
- "grad_norm": 0.009073416702449322,
1008
- "learning_rate": 3.8060233744356633e-06,
1009
- "loss": 1.111,
1010
- "step": 143
1011
- },
1012
- {
1013
- "epoch": 0.8899188876013905,
1014
- "grad_norm": 0.009199617430567741,
1015
- "learning_rate": 3.3996065358600782e-06,
1016
- "loss": 1.0996,
1017
- "step": 144
1018
- },
1019
- {
1020
- "epoch": 0.8960988798764001,
1021
- "grad_norm": 0.009446380659937859,
1022
- "learning_rate": 3.0153689607045845e-06,
1023
- "loss": 1.0956,
1024
- "step": 145
1025
- },
1026
- {
1027
- "epoch": 0.9022788721514098,
1028
- "grad_norm": 0.00881500355899334,
1029
- "learning_rate": 2.653493525244721e-06,
1030
- "loss": 1.0953,
1031
- "step": 146
1032
- },
1033
- {
1034
- "epoch": 0.9084588644264194,
1035
- "grad_norm": 0.009240192361176014,
1036
- "learning_rate": 2.314152462588659e-06,
1037
- "loss": 1.1046,
1038
- "step": 147
1039
- },
1040
- {
1041
- "epoch": 0.9146388567014291,
1042
- "grad_norm": 0.009173831902444363,
1043
- "learning_rate": 1.99750728070357e-06,
1044
- "loss": 1.097,
1045
- "step": 148
1046
- },
1047
- {
1048
- "epoch": 0.9208188489764387,
1049
- "grad_norm": 0.009656915441155434,
1050
- "learning_rate": 1.70370868554659e-06,
1051
- "loss": 1.0777,
1052
- "step": 149
1053
- },
1054
- {
1055
- "epoch": 0.9269988412514485,
1056
- "grad_norm": 0.008921938017010689,
1057
- "learning_rate": 1.4328965093369283e-06,
1058
- "loss": 1.0916,
1059
- "step": 150
1060
- },
1061
- {
1062
- "epoch": 0.9331788335264581,
1063
- "grad_norm": 0.009205098263919353,
1064
- "learning_rate": 1.1851996440033319e-06,
1065
- "loss": 1.1057,
1066
- "step": 151
1067
- },
1068
- {
1069
- "epoch": 0.9393588258014678,
1070
- "grad_norm": 0.008920296095311642,
1071
- "learning_rate": 9.607359798384785e-07,
1072
- "loss": 1.0906,
1073
- "step": 152
1074
- },
1075
- {
1076
- "epoch": 0.9455388180764774,
1077
- "grad_norm": 0.009275338612496853,
1078
- "learning_rate": 7.596123493895991e-07,
1079
- "loss": 1.1111,
1080
- "step": 153
1081
- },
1082
- {
1083
- "epoch": 0.9517188103514871,
1084
- "grad_norm": 0.008771958760917187,
1085
- "learning_rate": 5.81924476611967e-07,
1086
- "loss": 1.1001,
1087
- "step": 154
1088
- },
1089
- {
1090
- "epoch": 0.9578988026264967,
1091
- "grad_norm": 0.009109330363571644,
1092
- "learning_rate": 4.277569313094809e-07,
1093
- "loss": 1.0804,
1094
- "step": 155
1095
- },
1096
- {
1097
- "epoch": 0.9640787949015064,
1098
- "grad_norm": 0.009273674339056015,
1099
- "learning_rate": 2.971830888840177e-07,
1100
- "loss": 1.0919,
1101
- "step": 156
1102
- },
1103
- {
1104
- "epoch": 0.970258787176516,
1105
- "grad_norm": 0.008920193649828434,
1106
- "learning_rate": 1.9026509541272275e-07,
1107
- "loss": 1.0908,
1108
- "step": 157
1109
- },
1110
- {
1111
- "epoch": 0.9764387794515257,
1112
- "grad_norm": 0.008690367452800274,
1113
- "learning_rate": 1.0705383806982606e-07,
1114
- "loss": 1.1054,
1115
- "step": 158
1116
- },
1117
- {
1118
- "epoch": 0.9826187717265353,
1119
- "grad_norm": 0.009283354505896568,
1120
- "learning_rate": 4.7588920907110094e-08,
1121
- "loss": 1.1086,
1122
- "step": 159
1123
- },
1124
- {
1125
- "epoch": 0.988798764001545,
1126
- "grad_norm": 0.009044487960636616,
1127
- "learning_rate": 1.189864600454338e-08,
1128
- "loss": 1.117,
1129
- "step": 160
1130
- },
1131
- {
1132
- "epoch": 0.9949787562765546,
1133
- "grad_norm": 0.009743698872625828,
1134
  "learning_rate": 0.0,
1135
- "loss": 1.0973,
1136
- "step": 161
1137
  },
1138
  {
1139
- "epoch": 0.9949787562765546,
1140
- "step": 161,
1141
- "total_flos": 1132817220108288.0,
1142
- "train_loss": 0.578794286858221,
1143
- "train_runtime": 20320.7913,
1144
- "train_samples_per_second": 1.019,
1145
- "train_steps_per_second": 0.008
1146
  }
1147
  ],
1148
  "logging_steps": 1,
1149
- "max_steps": 161,
1150
  "num_input_tokens_seen": 0,
1151
  "num_train_epochs": 1,
1152
  "save_steps": 100,
@@ -1162,8 +861,8 @@
1162
  "attributes": {}
1163
  }
1164
  },
1165
- "total_flos": 1132817220108288.0,
1166
- "train_batch_size": 1,
1167
  "trial_name": null,
1168
  "trial_params": null
1169
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.9952556668423828,
5
  "eval_steps": 500,
6
+ "global_step": 118,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.008434370057986295,
13
+ "grad_norm": 0.08799133449792862,
14
+ "learning_rate": 4.9999999999999996e-06,
15
+ "loss": 1.6351,
16
  "step": 1
17
  },
18
  {
19
+ "epoch": 0.01686874011597259,
20
+ "grad_norm": 0.08821269869804382,
21
+ "learning_rate": 9.999999999999999e-06,
22
+ "loss": 1.6405,
23
  "step": 2
24
  },
25
  {
26
+ "epoch": 0.025303110173958882,
27
+ "grad_norm": 0.028541648760437965,
28
+ "learning_rate": 1.5e-05,
29
+ "loss": 1.6264,
30
  "step": 3
31
  },
32
  {
33
+ "epoch": 0.03373748023194518,
34
+ "grad_norm": 0.016522206366062164,
35
+ "learning_rate": 1.9999999999999998e-05,
36
+ "loss": 1.6233,
37
  "step": 4
38
  },
39
  {
40
+ "epoch": 0.04217185028993147,
41
+ "grad_norm": 0.054906539618968964,
42
+ "learning_rate": 2.5e-05,
43
+ "loss": 1.62,
44
  "step": 5
45
  },
46
  {
47
+ "epoch": 0.050606220347917764,
48
+ "grad_norm": 0.0514790378510952,
49
+ "learning_rate": 3e-05,
50
+ "loss": 1.6134,
51
  "step": 6
52
  },
53
  {
54
+ "epoch": 0.05904059040590406,
55
+ "grad_norm": 0.04156072437763214,
56
+ "learning_rate": 3.5000000000000004e-05,
57
+ "loss": 1.6169,
58
  "step": 7
59
  },
60
  {
61
+ "epoch": 0.06747496046389036,
62
+ "grad_norm": 0.05689298361539841,
63
+ "learning_rate": 3.9999999999999996e-05,
64
+ "loss": 1.6143,
65
  "step": 8
66
  },
67
  {
68
+ "epoch": 0.07590933052187665,
69
+ "grad_norm": 0.041525840759277344,
70
+ "learning_rate": 4.5e-05,
71
+ "loss": 1.6104,
72
  "step": 9
73
  },
74
  {
75
+ "epoch": 0.08434370057986294,
76
+ "grad_norm": 0.031016899272799492,
77
+ "learning_rate": 5e-05,
78
+ "loss": 1.6028,
79
  "step": 10
80
  },
81
  {
82
+ "epoch": 0.09277807063784924,
83
+ "grad_norm": 0.03775344789028168,
84
+ "learning_rate": 5.5e-05,
85
+ "loss": 1.5949,
86
  "step": 11
87
  },
88
  {
89
+ "epoch": 0.10121244069583553,
90
+ "grad_norm": 0.027061201632022858,
91
+ "learning_rate": 6e-05,
92
+ "loss": 1.5966,
93
  "step": 12
94
  },
95
  {
96
+ "epoch": 0.10964681075382182,
97
+ "grad_norm": 0.03555454686284065,
98
+ "learning_rate": 5.998682509526384e-05,
99
+ "loss": 1.601,
100
  "step": 13
101
  },
102
  {
103
+ "epoch": 0.11808118081180811,
104
+ "grad_norm": 0.038648299872875214,
105
+ "learning_rate": 5.994731195292965e-05,
106
+ "loss": 1.6015,
107
  "step": 14
108
  },
109
  {
110
+ "epoch": 0.1265155508697944,
111
+ "grad_norm": 0.03883035108447075,
112
+ "learning_rate": 5.988149527845651e-05,
113
+ "loss": 1.5992,
114
  "step": 15
115
  },
116
  {
117
+ "epoch": 0.13494992092778071,
118
+ "grad_norm": 0.03391977399587631,
119
+ "learning_rate": 5.978943288040551e-05,
120
+ "loss": 1.5932,
121
  "step": 16
122
  },
123
  {
124
+ "epoch": 0.143384290985767,
125
+ "grad_norm": 0.0362255796790123,
126
+ "learning_rate": 5.967120561966492e-05,
127
+ "loss": 1.5873,
128
  "step": 17
129
  },
130
  {
131
+ "epoch": 0.1518186610437533,
132
+ "grad_norm": 0.027403229847550392,
133
+ "learning_rate": 5.952691733842791e-05,
134
+ "loss": 1.5845,
135
  "step": 18
136
  },
137
  {
138
+ "epoch": 0.16025303110173958,
139
+ "grad_norm": 0.02821512520313263,
140
+ "learning_rate": 5.935669476898512e-05,
141
+ "loss": 1.5942,
142
  "step": 19
143
  },
144
  {
145
+ "epoch": 0.16868740115972589,
146
+ "grad_norm": 0.022913869470357895,
147
+ "learning_rate": 5.9160687422412324e-05,
148
+ "loss": 1.5976,
149
  "step": 20
150
  },
151
  {
152
+ "epoch": 0.17712177121771217,
153
+ "grad_norm": 0.02420000359416008,
154
+ "learning_rate": 5.893906745725076e-05,
155
+ "loss": 1.5862,
156
  "step": 21
157
  },
158
  {
159
+ "epoch": 0.18555614127569847,
160
+ "grad_norm": 0.021311871707439423,
161
+ "learning_rate": 5.8692029528295675e-05,
162
+ "loss": 1.5877,
163
  "step": 22
164
  },
165
  {
166
+ "epoch": 0.19399051133368478,
167
+ "grad_norm": 0.024183662608265877,
168
+ "learning_rate": 5.841979061562574e-05,
169
+ "loss": 1.584,
170
  "step": 23
171
  },
172
  {
173
+ "epoch": 0.20242488139167106,
174
+ "grad_norm": 0.02072131633758545,
175
+ "learning_rate": 5.8122589834023634e-05,
176
+ "loss": 1.5841,
177
  "step": 24
178
  },
179
  {
180
+ "epoch": 0.21085925144965736,
181
+ "grad_norm": 0.023273587226867676,
182
+ "learning_rate": 5.7800688222955e-05,
183
+ "loss": 1.5845,
184
  "step": 25
185
  },
186
  {
187
+ "epoch": 0.21929362150764364,
188
+ "grad_norm": 0.0180776659399271,
189
+ "learning_rate": 5.745436851729055e-05,
190
+ "loss": 1.594,
191
  "step": 26
192
  },
193
  {
194
+ "epoch": 0.22772799156562995,
195
+ "grad_norm": 0.018995055928826332,
196
+ "learning_rate": 5.708393489897231e-05,
197
+ "loss": 1.5903,
198
  "step": 27
199
  },
200
  {
201
+ "epoch": 0.23616236162361623,
202
+ "grad_norm": 0.017286648973822594,
203
+ "learning_rate": 5.668971272984242e-05,
204
+ "loss": 1.5804,
205
  "step": 28
206
  },
207
  {
208
+ "epoch": 0.24459673168160254,
209
+ "grad_norm": 0.018625088036060333,
210
+ "learning_rate": 5.6272048265869104e-05,
211
+ "loss": 1.5798,
212
  "step": 29
213
  },
214
  {
215
+ "epoch": 0.2530311017395888,
216
+ "grad_norm": 0.017109202221035957,
217
+ "learning_rate": 5.583130835302066e-05,
218
+ "loss": 1.5848,
219
  "step": 30
220
  },
221
  {
222
+ "epoch": 0.2614654717975751,
223
+ "grad_norm": 0.017000902444124222,
224
+ "learning_rate": 5.536788010505478e-05,
225
+ "loss": 1.5751,
226
  "step": 31
227
  },
228
  {
229
+ "epoch": 0.26989984185556143,
230
+ "grad_norm": 0.018897738307714462,
231
+ "learning_rate": 5.4882170563506055e-05,
232
+ "loss": 1.5799,
233
  "step": 32
234
  },
235
  {
236
+ "epoch": 0.2783342119135477,
237
+ "grad_norm": 0.017153726890683174,
238
+ "learning_rate": 5.437460634017044e-05,
239
+ "loss": 1.5758,
240
  "step": 33
241
  },
242
  {
243
+ "epoch": 0.286768581971534,
244
+ "grad_norm": 0.020006069913506508,
245
+ "learning_rate": 5.3845633242400604e-05,
246
+ "loss": 1.5774,
247
  "step": 34
248
  },
249
  {
250
+ "epoch": 0.2952029520295203,
251
+ "grad_norm": 0.016250574961304665,
252
+ "learning_rate": 5.329571588154127e-05,
253
+ "loss": 1.5748,
254
  "step": 35
255
  },
256
  {
257
+ "epoch": 0.3036373220875066,
258
+ "grad_norm": 0.019675249233841896,
259
+ "learning_rate": 5.2725337264848605e-05,
260
+ "loss": 1.5772,
261
  "step": 36
262
  },
263
  {
264
+ "epoch": 0.3120716921454929,
265
+ "grad_norm": 0.017005721107125282,
266
+ "learning_rate": 5.213499837125182e-05,
267
+ "loss": 1.5697,
268
  "step": 37
269
  },
270
  {
271
+ "epoch": 0.32050606220347916,
272
+ "grad_norm": 0.01664470136165619,
273
+ "learning_rate": 5.152521771132993e-05,
274
+ "loss": 1.5761,
275
  "step": 38
276
  },
277
  {
278
+ "epoch": 0.32894043226146547,
279
+ "grad_norm": 0.01764543540775776,
280
+ "learning_rate": 5.0896530871889914e-05,
281
+ "loss": 1.5793,
282
  "step": 39
283
  },
284
  {
285
+ "epoch": 0.33737480231945177,
286
+ "grad_norm": 0.016753442585468292,
287
+ "learning_rate": 5.024949004554632e-05,
288
+ "loss": 1.5658,
289
  "step": 40
290
  },
291
  {
292
+ "epoch": 0.3458091723774381,
293
+ "grad_norm": 0.019939422607421875,
294
+ "learning_rate": 4.958466354571565e-05,
295
+ "loss": 1.5762,
296
  "step": 41
297
  },
298
  {
299
+ "epoch": 0.35424354243542433,
300
+ "grad_norm": 0.01566561497747898,
301
+ "learning_rate": 4.890263530745134e-05,
302
+ "loss": 1.5703,
303
  "step": 42
304
  },
305
  {
306
+ "epoch": 0.36267791249341064,
307
+ "grad_norm": 0.015579808503389359,
308
+ "learning_rate": 4.8204004374557806e-05,
309
+ "loss": 1.577,
310
  "step": 43
311
  },
312
  {
313
+ "epoch": 0.37111228255139694,
314
+ "grad_norm": 0.016742996871471405,
315
+ "learning_rate": 4.748938437343416e-05,
316
+ "loss": 1.5726,
317
  "step": 44
318
  },
319
  {
320
+ "epoch": 0.37954665260938325,
321
+ "grad_norm": 0.017128925770521164,
322
+ "learning_rate": 4.675940297410958e-05,
323
+ "loss": 1.579,
324
  "step": 45
325
  },
326
  {
327
+ "epoch": 0.38798102266736956,
328
+ "grad_norm": 0.015266829170286655,
329
+ "learning_rate": 4.601470133894373e-05,
330
+ "loss": 1.5611,
331
  "step": 46
332
  },
333
  {
334
+ "epoch": 0.3964153927253558,
335
+ "grad_norm": 0.014922689646482468,
336
+ "learning_rate": 4.525593355947662e-05,
337
+ "loss": 1.5725,
338
  "step": 47
339
  },
340
  {
341
+ "epoch": 0.4048497627833421,
342
+ "grad_norm": 0.01651890017092228,
343
+ "learning_rate": 4.448376608192235e-05,
344
+ "loss": 1.5679,
345
  "step": 48
346
  },
347
  {
348
+ "epoch": 0.4132841328413284,
349
+ "grad_norm": 0.013002808205783367,
350
+ "learning_rate": 4.3698877121811395e-05,
351
+ "loss": 1.5712,
352
  "step": 49
353
  },
354
  {
355
+ "epoch": 0.42171850289931473,
356
+ "grad_norm": 0.013684232719242573,
357
+ "learning_rate": 4.290195606829562e-05,
358
+ "loss": 1.5683,
359
  "step": 50
360
  },
361
  {
362
+ "epoch": 0.430152872957301,
363
+ "grad_norm": 0.01470887940376997,
364
+ "learning_rate": 4.2093702878639174e-05,
365
+ "loss": 1.5784,
366
  "step": 51
367
  },
368
  {
369
+ "epoch": 0.4385872430152873,
370
+ "grad_norm": 0.013774153776466846,
371
+ "learning_rate": 4.127482746342714e-05,
372
+ "loss": 1.5648,
373
  "step": 52
374
  },
375
  {
376
+ "epoch": 0.4470216130732736,
377
+ "grad_norm": 0.01601037010550499,
378
+ "learning_rate": 4.044604906303197e-05,
379
+ "loss": 1.5671,
380
  "step": 53
381
  },
382
  {
383
+ "epoch": 0.4554559831312599,
384
+ "grad_norm": 0.013479109853506088,
385
+ "learning_rate": 3.960809561588513e-05,
386
+ "loss": 1.5759,
387
  "step": 54
388
  },
389
  {
390
+ "epoch": 0.46389035318924615,
391
+ "grad_norm": 0.01525378692895174,
392
+ "learning_rate": 3.876170311910928e-05,
393
+ "loss": 1.5672,
394
  "step": 55
395
  },
396
  {
397
+ "epoch": 0.47232472324723246,
398
+ "grad_norm": 0.013126607052981853,
399
+ "learning_rate": 3.790761498207203e-05,
400
+ "loss": 1.5744,
401
  "step": 56
402
  },
403
  {
404
+ "epoch": 0.48075909330521877,
405
+ "grad_norm": 0.013218970037996769,
406
+ "learning_rate": 3.704658137342952e-05,
407
+ "loss": 1.5688,
408
  "step": 57
409
  },
410
  {
411
+ "epoch": 0.48919346336320507,
412
+ "grad_norm": 0.014142030850052834,
413
+ "learning_rate": 3.617935856223295e-05,
414
+ "loss": 1.5742,
415
  "step": 58
416
  },
417
  {
418
+ "epoch": 0.4976278334211914,
419
+ "grad_norm": 0.013189482502639294,
420
+ "learning_rate": 3.5306708253677186e-05,
421
+ "loss": 1.5615,
422
  "step": 59
423
  },
424
  {
425
+ "epoch": 0.5060622034791776,
426
+ "grad_norm": 0.014055909588932991,
427
+ "learning_rate": 3.442939692007444e-05,
428
+ "loss": 1.5456,
429
  "step": 60
430
  },
431
  {
432
+ "epoch": 0.5144965735371639,
433
+ "grad_norm": 0.011999402195215225,
434
+ "learning_rate": 3.354819512764097e-05,
435
+ "loss": 1.5579,
436
  "step": 61
437
  },
438
  {
439
+ "epoch": 0.5229309435951502,
440
+ "grad_norm": 0.015170286409556866,
441
+ "learning_rate": 3.2663876859688045e-05,
442
+ "loss": 1.5606,
443
  "step": 62
444
  },
445
  {
446
+ "epoch": 0.5313653136531366,
447
+ "grad_norm": 0.013461374677717686,
448
+ "learning_rate": 3.177721883681143e-05,
449
+ "loss": 1.5631,
450
  "step": 63
451
  },
452
  {
453
+ "epoch": 0.5397996837111229,
454
+ "grad_norm": 0.014450161717832088,
455
+ "learning_rate": 3.0888999834676796e-05,
456
+ "loss": 1.5606,
457
  "step": 64
458
  },
459
  {
460
+ "epoch": 0.5482340537691092,
461
+ "grad_norm": 0.014033439569175243,
462
+ "learning_rate": 3e-05,
463
+ "loss": 1.5638,
464
  "step": 65
465
  },
466
  {
467
+ "epoch": 0.5566684238270954,
468
+ "grad_norm": 0.014029957354068756,
469
+ "learning_rate": 2.9111000165323206e-05,
470
+ "loss": 1.5656,
471
  "step": 66
472
  },
473
  {
474
+ "epoch": 0.5651027938850817,
475
+ "grad_norm": 0.016938265413045883,
476
+ "learning_rate": 2.8222781163188573e-05,
477
+ "loss": 1.5595,
478
  "step": 67
479
  },
480
  {
481
+ "epoch": 0.573537163943068,
482
+ "grad_norm": 0.014442404732108116,
483
+ "learning_rate": 2.7336123140311957e-05,
484
+ "loss": 1.5627,
485
  "step": 68
486
  },
487
  {
488
+ "epoch": 0.5819715340010543,
489
+ "grad_norm": 0.015609300695359707,
490
+ "learning_rate": 2.645180487235903e-05,
491
+ "loss": 1.5707,
492
  "step": 69
493
  },
494
  {
495
+ "epoch": 0.5904059040590406,
496
+ "grad_norm": 0.014037694782018661,
497
+ "learning_rate": 2.557060307992557e-05,
498
+ "loss": 1.5635,
499
  "step": 70
500
  },
501
  {
502
+ "epoch": 0.5988402741170269,
503
+ "grad_norm": 0.013035484589636326,
504
+ "learning_rate": 2.469329174632282e-05,
505
+ "loss": 1.5635,
506
  "step": 71
507
  },
508
  {
509
+ "epoch": 0.6072746441750132,
510
+ "grad_norm": 0.013149570673704147,
511
+ "learning_rate": 2.3820641437767053e-05,
512
+ "loss": 1.5607,
513
  "step": 72
514
  },
515
  {
516
+ "epoch": 0.6157090142329995,
517
+ "grad_norm": 0.01272524707019329,
518
+ "learning_rate": 2.2953418626570494e-05,
519
+ "loss": 1.5524,
520
  "step": 73
521
  },
522
  {
523
+ "epoch": 0.6241433842909858,
524
+ "grad_norm": 0.01219966635107994,
525
+ "learning_rate": 2.209238501792798e-05,
526
+ "loss": 1.555,
527
  "step": 74
528
  },
529
  {
530
+ "epoch": 0.632577754348972,
531
+ "grad_norm": 0.01229917537420988,
532
+ "learning_rate": 2.123829688089073e-05,
533
+ "loss": 1.5514,
534
  "step": 75
535
  },
536
  {
537
+ "epoch": 0.6410121244069583,
538
+ "grad_norm": 0.013784164562821388,
539
+ "learning_rate": 2.0391904384114877e-05,
540
+ "loss": 1.5614,
541
  "step": 76
542
  },
543
  {
544
+ "epoch": 0.6494464944649446,
545
+ "grad_norm": 0.010503321886062622,
546
+ "learning_rate": 1.9553950936968042e-05,
547
+ "loss": 1.541,
548
  "step": 77
549
  },
550
  {
551
+ "epoch": 0.6578808645229309,
552
+ "grad_norm": 0.012291346676647663,
553
+ "learning_rate": 1.8725172536572863e-05,
554
+ "loss": 1.556,
555
  "step": 78
556
  },
557
  {
558
+ "epoch": 0.6663152345809172,
559
+ "grad_norm": 0.011516911908984184,
560
+ "learning_rate": 1.7906297121360838e-05,
561
+ "loss": 1.5638,
562
  "step": 79
563
  },
564
  {
565
+ "epoch": 0.6747496046389035,
566
+ "grad_norm": 0.01181780081242323,
567
+ "learning_rate": 1.7098043931704396e-05,
568
+ "loss": 1.5508,
569
  "step": 80
570
  },
571
  {
572
+ "epoch": 0.6831839746968899,
573
+ "grad_norm": 0.010808738879859447,
574
+ "learning_rate": 1.6301122878188607e-05,
575
+ "loss": 1.5567,
576
  "step": 81
577
  },
578
  {
579
+ "epoch": 0.6916183447548762,
580
+ "grad_norm": 0.010649660602211952,
581
+ "learning_rate": 1.551623391807766e-05,
582
+ "loss": 1.5484,
583
  "step": 82
584
  },
585
  {
586
+ "epoch": 0.7000527148128625,
587
+ "grad_norm": 0.010580360889434814,
588
+ "learning_rate": 1.4744066440523391e-05,
589
+ "loss": 1.5591,
590
  "step": 83
591
  },
592
  {
593
+ "epoch": 0.7084870848708487,
594
+ "grad_norm": 0.010917909443378448,
595
+ "learning_rate": 1.3985298661056292e-05,
596
+ "loss": 1.569,
597
  "step": 84
598
  },
599
  {
600
+ "epoch": 0.716921454928835,
601
+ "grad_norm": 0.01177785824984312,
602
+ "learning_rate": 1.324059702589043e-05,
603
+ "loss": 1.5631,
604
  "step": 85
605
  },
606
  {
607
+ "epoch": 0.7253558249868213,
608
+ "grad_norm": 0.009857219643890858,
609
+ "learning_rate": 1.2510615626565844e-05,
610
+ "loss": 1.5561,
611
  "step": 86
612
  },
613
  {
614
+ "epoch": 0.7337901950448076,
615
+ "grad_norm": 0.011106839403510094,
616
+ "learning_rate": 1.1795995625442208e-05,
617
+ "loss": 1.5471,
618
  "step": 87
619
  },
620
  {
621
+ "epoch": 0.7422245651027939,
622
+ "grad_norm": 0.011377968825399876,
623
+ "learning_rate": 1.109736469254867e-05,
624
+ "loss": 1.5583,
625
  "step": 88
626
  },
627
  {
628
+ "epoch": 0.7506589351607802,
629
+ "grad_norm": 0.010118059813976288,
630
+ "learning_rate": 1.0415336454284356e-05,
631
+ "loss": 1.5531,
632
  "step": 89
633
  },
634
  {
635
+ "epoch": 0.7590933052187665,
636
+ "grad_norm": 0.01021275483071804,
637
+ "learning_rate": 9.75050995445369e-06,
638
+ "loss": 1.5559,
639
  "step": 90
640
  },
641
  {
642
+ "epoch": 0.7675276752767528,
643
+ "grad_norm": 0.00994526594877243,
644
+ "learning_rate": 9.103469128110098e-06,
645
+ "loss": 1.5527,
646
  "step": 91
647
  },
648
  {
649
+ "epoch": 0.7759620453347391,
650
+ "grad_norm": 0.01060432381927967,
651
+ "learning_rate": 8.474782288670058e-06,
652
+ "loss": 1.5514,
653
  "step": 92
654
  },
655
  {
656
+ "epoch": 0.7843964153927253,
657
+ "grad_norm": 0.011965557001531124,
658
+ "learning_rate": 7.86500162874818e-06,
659
+ "loss": 1.5536,
660
  "step": 93
661
  },
662
  {
663
+ "epoch": 0.7928307854507116,
664
+ "grad_norm": 0.010221057571470737,
665
+ "learning_rate": 7.274662735151396e-06,
666
+ "loss": 1.5541,
667
  "step": 94
668
  },
669
  {
670
+ "epoch": 0.8012651555086979,
671
+ "grad_norm": 0.01093184296041727,
672
+ "learning_rate": 6.704284118458731e-06,
673
+ "loss": 1.5512,
674
  "step": 95
675
  },
676
  {
677
+ "epoch": 0.8096995255666842,
678
+ "grad_norm": 0.010998157784342766,
679
+ "learning_rate": 6.154366757599399e-06,
680
+ "loss": 1.5492,
681
  "step": 96
682
  },
683
  {
684
+ "epoch": 0.8181338956246705,
685
+ "grad_norm": 0.01003272831439972,
686
+ "learning_rate": 5.625393659829561e-06,
687
+ "loss": 1.5472,
688
  "step": 97
689
  },
690
  {
691
+ "epoch": 0.8265682656826568,
692
+ "grad_norm": 0.010513346642255783,
693
+ "learning_rate": 5.117829436493947e-06,
694
+ "loss": 1.551,
695
  "step": 98
696
  },
697
  {
698
+ "epoch": 0.8350026357406432,
699
+ "grad_norm": 0.01016693189740181,
700
+ "learning_rate": 4.632119894945215e-06,
701
+ "loss": 1.5599,
702
  "step": 99
703
  },
704
  {
705
+ "epoch": 0.8434370057986295,
706
+ "grad_norm": 0.009756877087056637,
707
+ "learning_rate": 4.1686916469793335e-06,
708
+ "loss": 1.5552,
709
  "step": 100
710
  },
711
  {
712
+ "epoch": 0.8518713758566157,
713
+ "grad_norm": 0.010328919626772404,
714
+ "learning_rate": 3.7279517341308977e-06,
715
+ "loss": 1.5645,
716
  "step": 101
717
  },
718
  {
719
+ "epoch": 0.860305745914602,
720
+ "grad_norm": 0.009724525734782219,
721
+ "learning_rate": 3.3102872701575838e-06,
722
+ "loss": 1.5466,
723
  "step": 102
724
  },
725
  {
726
+ "epoch": 0.8687401159725883,
727
+ "grad_norm": 0.009452255442738533,
728
+ "learning_rate": 2.916065101027694e-06,
729
+ "loss": 1.555,
730
  "step": 103
731
  },
732
  {
733
+ "epoch": 0.8771744860305746,
734
+ "grad_norm": 0.009558911435306072,
735
+ "learning_rate": 2.5456314827094463e-06,
736
+ "loss": 1.5479,
737
  "step": 104
738
  },
739
  {
740
+ "epoch": 0.8856088560885609,
741
+ "grad_norm": 0.009129817597568035,
742
+ "learning_rate": 2.1993117770449987e-06,
743
+ "loss": 1.545,
744
  "step": 105
745
  },
746
  {
747
+ "epoch": 0.8940432261465472,
748
+ "grad_norm": 0.00930058490484953,
749
+ "learning_rate": 1.8774101659763731e-06,
750
+ "loss": 1.554,
751
  "step": 106
752
  },
753
  {
754
+ "epoch": 0.9024775962045335,
755
+ "grad_norm": 0.009718949906527996,
756
+ "learning_rate": 1.5802093843742582e-06,
757
+ "loss": 1.5467,
758
  "step": 107
759
  },
760
  {
761
+ "epoch": 0.9109119662625198,
762
+ "grad_norm": 0.009196877479553223,
763
+ "learning_rate": 1.3079704717043273e-06,
764
+ "loss": 1.55,
765
  "step": 108
766
  },
767
  {
768
+ "epoch": 0.9193463363205061,
769
+ "grad_norm": 0.00919976457953453,
770
+ "learning_rate": 1.060932542749241e-06,
771
+ "loss": 1.5558,
772
  "step": 109
773
  },
774
  {
775
+ "epoch": 0.9277807063784923,
776
+ "grad_norm": 0.0089542455971241,
777
+ "learning_rate": 8.393125775876775e-07,
778
+ "loss": 1.5563,
779
  "step": 110
780
  },
781
  {
782
+ "epoch": 0.9362150764364786,
783
+ "grad_norm": 0.009196256287395954,
784
+ "learning_rate": 6.433052310148791e-07,
785
+ "loss": 1.5537,
786
  "step": 111
787
  },
788
  {
789
+ "epoch": 0.9446494464944649,
790
+ "grad_norm": 0.009201628156006336,
791
+ "learning_rate": 4.730826615720951e-07,
792
+ "loss": 1.5567,
793
  "step": 112
794
  },
795
  {
796
+ "epoch": 0.9530838165524512,
797
+ "grad_norm": 0.008883966132998466,
798
+ "learning_rate": 3.28794380335079e-07,
799
+ "loss": 1.5549,
800
  "step": 113
801
  },
802
  {
803
+ "epoch": 0.9615181866104375,
804
+ "grad_norm": 0.009221088141202927,
805
+ "learning_rate": 2.1056711959449247e-07,
806
+ "loss": 1.5585,
807
  "step": 114
808
  },
809
  {
810
+ "epoch": 0.9699525566684238,
811
+ "grad_norm": 0.009092201478779316,
812
+ "learning_rate": 1.1850472154349313e-07,
813
+ "loss": 1.5536,
814
  "step": 115
815
  },
816
  {
817
+ "epoch": 0.9783869267264101,
818
+ "grad_norm": 0.009470025077462196,
819
+ "learning_rate": 5.268804707035946e-08,
820
+ "loss": 1.5705,
821
  "step": 116
822
  },
823
  {
824
+ "epoch": 0.9868212967843965,
825
+ "grad_norm": 0.008715336211025715,
826
+ "learning_rate": 1.3174904736169557e-08,
827
+ "loss": 1.5566,
828
  "step": 117
829
  },
830
  {
831
+ "epoch": 0.9952556668423828,
832
+ "grad_norm": 0.008857190608978271,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
833
  "learning_rate": 0.0,
834
+ "loss": 1.5464,
835
+ "step": 118
836
  },
837
  {
838
+ "epoch": 0.9952556668423828,
839
+ "step": 118,
840
+ "total_flos": 1660937136242688.0,
841
+ "train_loss": 1.571211524939133,
842
+ "train_runtime": 47361.2024,
843
+ "train_samples_per_second": 0.641,
844
+ "train_steps_per_second": 0.002
845
  }
846
  ],
847
  "logging_steps": 1,
848
+ "max_steps": 118,
849
  "num_input_tokens_seen": 0,
850
  "num_train_epochs": 1,
851
  "save_steps": 100,
 
861
  "attributes": {}
862
  }
863
  },
864
+ "total_flos": 1660937136242688.0,
865
+ "train_batch_size": 2,
866
  "trial_name": null,
867
  "trial_params": null
868
  }
training_loss.png CHANGED