vantaa32 committed on
Commit
3bd7913
·
verified ·
1 Parent(s): 92ca991

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -9,15 +9,15 @@
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "modules_to_save": null,
12
- "n_frequency": 100000,
13
  "n_frequency_pattern": {},
14
  "peft_type": "FOURIERFT",
15
  "random_loc_seed": 777,
16
  "revision": null,
17
- "scaling": 512.0,
18
  "target_modules": [
19
- "v_proj",
20
- "q_proj"
21
  ],
22
  "task_type": "CAUSAL_LM"
23
  }
 
9
  "layers_pattern": null,
10
  "layers_to_transform": null,
11
  "modules_to_save": null,
12
+ "n_frequency": 1000,
13
  "n_frequency_pattern": {},
14
  "peft_type": "FOURIERFT",
15
  "random_loc_seed": 777,
16
  "revision": null,
17
+ "scaling": 300.0,
18
  "target_modules": [
19
+ "q_proj",
20
+ "v_proj"
21
  ],
22
  "task_type": "CAUSAL_LM"
23
  }
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e4fbb0015872578b2c8b2b7fe2c79c1e7f37bb2418b2364f89ecf07dbf522ed
3
- size 25608864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4dccb0c0555ab8492aa0633409c9039f47c0f65dcacdaaf2a1d45e8b8334cb37
3
+ size 264480
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2f8fb2fe3b7a1f45e045e1abe8eb69243208b5ab9193023a6530575e206b3d01
3
- size 51254010
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03d541d47b76e3bb8e3f55a8abd2aa7f6078e58d48eaa9b18a4ecef64a6fbcb1
3
+ size 561402
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e8f091fd805b9903d345d99c13640ce0c04978aa8df1b0b259f57dcc1650d70
3
  size 14180
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e9db1c36a3fe626194b8016f36409bad40a8f19f4c1ea5186e4318edad327f17
3
  size 14180
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f6a8ea1b5c47dca8d6e3455f5f85a613048d99290bddfe4776319de664eba2f
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af6df9be5fec90cdc67071cd0bc07ba292c19bae637d054dfd010deb1ccf035e
3
  size 1064
tokenizer_config.json CHANGED
@@ -33,7 +33,7 @@
33
  "eos_token": "</s>",
34
  "extra_special_tokens": {},
35
  "legacy": false,
36
- "model_max_length": 2048,
37
  "pad_token": "<unk>",
38
  "padding_side": "right",
39
  "sp_model_kwargs": {},
 
33
  "eos_token": "</s>",
34
  "extra_special_tokens": {},
35
  "legacy": false,
36
+ "model_max_length": 512,
37
  "pad_token": "<unk>",
38
  "padding_side": "right",
39
  "sp_model_kwargs": {},
trainer_state.json CHANGED
@@ -1,369 +1,384 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7727975270479135,
5
- "eval_steps": 500,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.015455950540958269,
13
- "grad_norm": 0.357046514749527,
14
- "learning_rate": 0.0003846153846153846,
15
- "loss": 1.5383,
16
- "step": 10
17
  },
18
  {
19
- "epoch": 0.030911901081916538,
20
- "grad_norm": 0.11198900640010834,
21
- "learning_rate": 0.0007692307692307692,
22
- "loss": 1.1091,
23
- "step": 20
24
  },
25
  {
26
- "epoch": 0.04636785162287481,
27
- "grad_norm": 0.056582603603601456,
28
- "learning_rate": 0.001153846153846154,
29
- "loss": 0.7939,
30
- "step": 30
31
  },
32
  {
33
- "epoch": 0.061823802163833076,
34
- "grad_norm": 0.0210476852953434,
35
- "learning_rate": 0.0015384615384615385,
36
- "loss": 0.6165,
37
- "step": 40
38
  },
39
  {
40
- "epoch": 0.07727975270479134,
41
- "grad_norm": 0.012083015404641628,
42
- "learning_rate": 0.0019230769230769232,
43
- "loss": 0.5624,
44
- "step": 50
45
  },
46
  {
47
- "epoch": 0.09273570324574962,
48
- "grad_norm": 0.008505144156515598,
49
- "learning_rate": 0.002307692307692308,
50
- "loss": 0.5269,
51
- "step": 60
52
  },
53
  {
54
- "epoch": 0.10819165378670788,
55
- "grad_norm": 0.0063809980638325214,
56
- "learning_rate": 0.0026923076923076926,
57
- "loss": 0.5059,
58
- "step": 70
59
  },
60
  {
61
- "epoch": 0.12364760432766615,
62
- "grad_norm": 0.005832094699144363,
63
- "learning_rate": 0.0029950657894736842,
64
- "loss": 0.5202,
65
- "step": 80
66
  },
67
  {
68
- "epoch": 0.1391035548686244,
69
- "grad_norm": 0.004662094172090292,
70
- "learning_rate": 0.0029703947368421055,
71
- "loss": 0.5087,
72
- "step": 90
73
  },
74
  {
75
- "epoch": 0.1545595054095827,
76
- "grad_norm": 0.004813206382095814,
77
- "learning_rate": 0.0029457236842105267,
78
- "loss": 0.4979,
79
- "step": 100
80
  },
81
  {
82
- "epoch": 0.17001545595054096,
83
- "grad_norm": 0.003981301095336676,
84
- "learning_rate": 0.0029210526315789475,
85
- "loss": 0.4833,
86
- "step": 110
87
  },
88
  {
89
- "epoch": 0.18547140649149924,
90
- "grad_norm": 0.0037942214403301477,
91
- "learning_rate": 0.0028963815789473687,
92
- "loss": 0.4842,
93
- "step": 120
94
  },
95
  {
96
- "epoch": 0.2009273570324575,
97
- "grad_norm": 0.0041742450557649136,
98
- "learning_rate": 0.0028717105263157895,
99
- "loss": 0.4818,
100
- "step": 130
101
  },
102
  {
103
- "epoch": 0.21638330757341576,
104
- "grad_norm": 0.005099099595099688,
105
- "learning_rate": 0.0028470394736842108,
106
- "loss": 0.4809,
107
- "step": 140
108
  },
109
  {
110
- "epoch": 0.23183925811437403,
111
- "grad_norm": 0.0031047100201249123,
112
- "learning_rate": 0.0028223684210526316,
113
- "loss": 0.5016,
114
- "step": 150
115
  },
116
  {
117
- "epoch": 0.2472952086553323,
118
- "grad_norm": 0.0036040199920535088,
119
- "learning_rate": 0.002797697368421053,
120
- "loss": 0.4775,
121
- "step": 160
122
  },
123
  {
124
- "epoch": 0.26275115919629055,
125
- "grad_norm": 0.0033861789852380753,
126
- "learning_rate": 0.0027730263157894736,
127
- "loss": 0.4784,
128
- "step": 170
129
  },
130
  {
131
- "epoch": 0.2782071097372488,
132
- "grad_norm": 0.003118926426395774,
133
- "learning_rate": 0.002748355263157895,
134
- "loss": 0.4962,
135
- "step": 180
136
  },
137
  {
138
- "epoch": 0.2936630602782071,
139
- "grad_norm": 0.0035265563055872917,
140
- "learning_rate": 0.002723684210526316,
141
- "loss": 0.4829,
142
- "step": 190
143
  },
144
  {
145
- "epoch": 0.3091190108191654,
146
- "grad_norm": 0.0035475995391607285,
147
- "learning_rate": 0.002699013157894737,
148
- "loss": 0.485,
149
- "step": 200
150
  },
151
  {
152
- "epoch": 0.32457496136012365,
153
- "grad_norm": 0.0030264686793088913,
154
- "learning_rate": 0.002674342105263158,
155
- "loss": 0.4681,
156
- "step": 210
157
  },
158
  {
159
- "epoch": 0.3400309119010819,
160
- "grad_norm": 0.0033854299690574408,
161
- "learning_rate": 0.002649671052631579,
162
- "loss": 0.4805,
163
- "step": 220
164
  },
165
  {
166
- "epoch": 0.3554868624420402,
167
- "grad_norm": 0.0029569112230092287,
168
- "learning_rate": 0.002625,
169
- "loss": 0.4688,
170
- "step": 230
171
  },
172
  {
173
- "epoch": 0.37094281298299847,
174
- "grad_norm": 0.0032272525131702423,
175
- "learning_rate": 0.002600328947368421,
176
- "loss": 0.4752,
177
- "step": 240
178
  },
179
  {
180
- "epoch": 0.38639876352395675,
181
- "grad_norm": 0.003502602456137538,
182
- "learning_rate": 0.002575657894736842,
183
- "loss": 0.4699,
184
- "step": 250
185
  },
186
  {
187
- "epoch": 0.401854714064915,
188
- "grad_norm": 0.0031522298231720924,
189
- "learning_rate": 0.002550986842105263,
190
- "loss": 0.4756,
191
- "step": 260
192
  },
193
  {
194
- "epoch": 0.41731066460587324,
195
- "grad_norm": 0.003098264569416642,
196
- "learning_rate": 0.0025263157894736842,
197
- "loss": 0.4574,
198
- "step": 270
199
  },
200
  {
201
- "epoch": 0.4327666151468315,
202
- "grad_norm": 0.0025676521472632885,
203
- "learning_rate": 0.0025016447368421055,
204
- "loss": 0.4779,
205
- "step": 280
206
  },
207
  {
208
- "epoch": 0.4482225656877898,
209
- "grad_norm": 0.0034302272833883762,
210
- "learning_rate": 0.0024769736842105263,
211
- "loss": 0.4729,
212
- "step": 290
213
  },
214
  {
215
- "epoch": 0.46367851622874806,
216
- "grad_norm": 0.003159865504130721,
217
- "learning_rate": 0.0024523026315789475,
218
- "loss": 0.4715,
219
- "step": 300
220
  },
221
  {
222
- "epoch": 0.47913446676970634,
223
- "grad_norm": 0.003168923780322075,
224
- "learning_rate": 0.0024276315789473683,
225
- "loss": 0.4764,
226
- "step": 310
227
  },
228
  {
229
- "epoch": 0.4945904173106646,
230
- "grad_norm": 0.0034859515726566315,
231
- "learning_rate": 0.0024029605263157896,
232
- "loss": 0.4652,
233
- "step": 320
234
  },
235
  {
236
- "epoch": 0.5100463678516228,
237
- "grad_norm": 0.003067239187657833,
238
- "learning_rate": 0.0023782894736842104,
239
- "loss": 0.4648,
240
- "step": 330
241
  },
242
  {
243
- "epoch": 0.5255023183925811,
244
- "grad_norm": 0.0032223982270807028,
245
- "learning_rate": 0.0023536184210526316,
246
- "loss": 0.4725,
247
- "step": 340
248
  },
249
  {
250
- "epoch": 0.5409582689335394,
251
- "grad_norm": 0.0027090355288237333,
252
- "learning_rate": 0.0023289473684210524,
253
- "loss": 0.4704,
254
- "step": 350
 
 
 
 
 
 
 
255
  },
256
  {
257
- "epoch": 0.5564142194744977,
258
- "grad_norm": 0.003484300570562482,
259
- "learning_rate": 0.0023042763157894736,
260
- "loss": 0.4616,
261
- "step": 360
262
  },
263
  {
264
- "epoch": 0.5718701700154559,
265
- "grad_norm": 0.003339330432936549,
266
- "learning_rate": 0.0022796052631578944,
267
- "loss": 0.4665,
268
- "step": 370
269
  },
270
  {
271
- "epoch": 0.5873261205564142,
272
- "grad_norm": 0.0029797593597322702,
273
- "learning_rate": 0.002254934210526316,
274
- "loss": 0.4573,
275
- "step": 380
276
  },
277
  {
278
- "epoch": 0.6027820710973725,
279
- "grad_norm": 0.0030033981893211603,
280
- "learning_rate": 0.002230263157894737,
281
- "loss": 0.4618,
282
- "step": 390
283
  },
284
  {
285
- "epoch": 0.6182380216383307,
286
- "grad_norm": 0.005113155115395784,
287
- "learning_rate": 0.002205592105263158,
288
- "loss": 0.4589,
289
- "step": 400
290
  },
291
  {
292
- "epoch": 0.633693972179289,
293
- "grad_norm": 0.002975397277623415,
294
- "learning_rate": 0.002180921052631579,
295
- "loss": 0.4765,
296
- "step": 410
297
  },
298
  {
299
- "epoch": 0.6491499227202473,
300
- "grad_norm": 0.004753004759550095,
301
- "learning_rate": 0.00215625,
302
- "loss": 0.4631,
303
- "step": 420
304
  },
305
  {
306
- "epoch": 0.6646058732612056,
307
- "grad_norm": 0.003564928425475955,
308
- "learning_rate": 0.002131578947368421,
309
- "loss": 0.4488,
310
- "step": 430
311
  },
312
  {
313
- "epoch": 0.6800618238021638,
314
- "grad_norm": 0.0032665496692061424,
315
- "learning_rate": 0.0021069078947368422,
316
- "loss": 0.457,
317
- "step": 440
318
  },
319
  {
320
- "epoch": 0.6955177743431221,
321
- "grad_norm": 0.0030079709831625223,
322
- "learning_rate": 0.002082236842105263,
323
- "loss": 0.4667,
324
- "step": 450
325
  },
326
  {
327
- "epoch": 0.7109737248840804,
328
- "grad_norm": 0.0025733078364282846,
329
- "learning_rate": 0.0020575657894736843,
330
- "loss": 0.4667,
331
- "step": 460
332
  },
333
  {
334
- "epoch": 0.7264296754250387,
335
- "grad_norm": 0.00270587345585227,
336
- "learning_rate": 0.0020328947368421055,
337
- "loss": 0.4679,
338
- "step": 470
339
  },
340
  {
341
- "epoch": 0.7418856259659969,
342
- "grad_norm": 0.00273908581584692,
343
- "learning_rate": 0.0020082236842105263,
344
- "loss": 0.4694,
345
- "step": 480
346
  },
347
  {
348
- "epoch": 0.7573415765069552,
349
- "grad_norm": 0.002720112446695566,
350
- "learning_rate": 0.0019835526315789475,
351
- "loss": 0.4513,
352
- "step": 490
353
  },
354
  {
355
- "epoch": 0.7727975270479135,
356
- "grad_norm": 0.0028910296969115734,
357
- "learning_rate": 0.0019588815789473683,
358
- "loss": 0.4592,
359
- "step": 500
 
 
 
 
 
 
 
 
360
  }
361
  ],
362
- "logging_steps": 10,
363
- "max_steps": 1294,
364
  "num_input_tokens_seen": 0,
365
- "num_train_epochs": 2,
366
- "save_steps": 500,
367
  "stateful_callbacks": {
368
  "TrainerControl": {
369
  "args": {
@@ -371,12 +386,12 @@
371
  "should_evaluate": false,
372
  "should_log": false,
373
  "should_save": true,
374
- "should_training_stop": false
375
  },
376
  "attributes": {}
377
  }
378
  },
379
- "total_flos": 5.1151645505224704e+17,
380
  "train_batch_size": 4,
381
  "trial_name": null,
382
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.0,
5
+ "eval_steps": 5,
6
+ "global_step": 2588,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.019319938176197836,
13
+ "grad_norm": 0.00098650180734694,
14
+ "learning_rate": 0.009615384615384616,
15
+ "loss": 0.9907,
16
+ "step": 50
17
  },
18
  {
19
+ "epoch": 0.03863987635239567,
20
+ "grad_norm": 0.000779022928327322,
21
+ "learning_rate": 0.019230769230769232,
22
+ "loss": 0.9647,
23
+ "step": 100
24
  },
25
  {
26
+ "epoch": 0.05795981452859351,
27
+ "grad_norm": 0.000611725845374167,
28
+ "learning_rate": 0.028846153846153844,
29
+ "loss": 0.9412,
30
+ "step": 150
31
  },
32
  {
33
+ "epoch": 0.07727975270479134,
34
+ "grad_norm": 0.0005838441429659724,
35
+ "learning_rate": 0.029457236842105262,
36
+ "loss": 0.9322,
37
+ "step": 200
38
  },
39
  {
40
+ "epoch": 0.09659969088098919,
41
+ "grad_norm": 0.0007691067876294255,
42
+ "learning_rate": 0.028840460526315788,
43
+ "loss": 0.9131,
44
+ "step": 250
45
  },
46
  {
47
+ "epoch": 0.11591962905718702,
48
+ "grad_norm": 0.0005935626104474068,
49
+ "learning_rate": 0.028223684210526314,
50
+ "loss": 0.9104,
51
+ "step": 300
52
  },
53
  {
54
+ "epoch": 0.13523956723338484,
55
+ "grad_norm": 0.0006890599033795297,
56
+ "learning_rate": 0.02760690789473684,
57
+ "loss": 0.9214,
58
+ "step": 350
59
  },
60
  {
61
+ "epoch": 0.1545595054095827,
62
+ "grad_norm": 0.0006042916793376207,
63
+ "learning_rate": 0.02699013157894737,
64
+ "loss": 0.9,
65
+ "step": 400
66
  },
67
  {
68
+ "epoch": 0.17387944358578053,
69
+ "grad_norm": 0.0005447549629025161,
70
+ "learning_rate": 0.026373355263157892,
71
+ "loss": 0.9097,
72
+ "step": 450
73
  },
74
  {
75
+ "epoch": 0.19319938176197837,
76
+ "grad_norm": 0.0004888740368187428,
77
+ "learning_rate": 0.02575657894736842,
78
+ "loss": 0.9037,
79
+ "step": 500
80
  },
81
  {
82
+ "epoch": 0.2125193199381762,
83
+ "grad_norm": 0.0008238813607022166,
84
+ "learning_rate": 0.025139802631578945,
85
+ "loss": 0.899,
86
+ "step": 550
87
  },
88
  {
89
+ "epoch": 0.23183925811437403,
90
+ "grad_norm": 0.000727724633179605,
91
+ "learning_rate": 0.024523026315789474,
92
+ "loss": 0.923,
93
+ "step": 600
94
  },
95
  {
96
+ "epoch": 0.2511591962905719,
97
+ "grad_norm": 0.0005605846527032554,
98
+ "learning_rate": 0.02390625,
99
+ "loss": 0.9031,
100
+ "step": 650
101
  },
102
  {
103
+ "epoch": 0.2704791344667697,
104
+ "grad_norm": 0.0007705381722189486,
105
+ "learning_rate": 0.023289473684210523,
106
+ "loss": 0.9013,
107
+ "step": 700
108
  },
109
  {
110
+ "epoch": 0.28979907264296756,
111
+ "grad_norm": 0.0007164838025346398,
112
+ "learning_rate": 0.022672697368421053,
113
+ "loss": 0.8971,
114
+ "step": 750
115
  },
116
  {
117
+ "epoch": 0.3091190108191654,
118
+ "grad_norm": 0.000717374321538955,
119
+ "learning_rate": 0.02205592105263158,
120
+ "loss": 0.8866,
121
+ "step": 800
122
  },
123
  {
124
+ "epoch": 0.3284389489953632,
125
+ "grad_norm": 0.0006394012016244233,
126
+ "learning_rate": 0.021439144736842105,
127
+ "loss": 0.899,
128
+ "step": 850
129
  },
130
  {
131
+ "epoch": 0.34775888717156106,
132
+ "grad_norm": 0.0006252205348573625,
133
+ "learning_rate": 0.02082236842105263,
134
+ "loss": 0.894,
135
+ "step": 900
136
  },
137
  {
138
+ "epoch": 0.3670788253477589,
139
+ "grad_norm": 0.0006903470493853092,
140
+ "learning_rate": 0.020205592105263157,
141
+ "loss": 0.8858,
142
+ "step": 950
143
  },
144
  {
145
+ "epoch": 0.38639876352395675,
146
+ "grad_norm": 0.0008341589127667248,
147
+ "learning_rate": 0.019588815789473683,
148
+ "loss": 0.9168,
149
+ "step": 1000
150
  },
151
  {
152
+ "epoch": 0.40571870170015456,
153
+ "grad_norm": 0.0005771280848421156,
154
+ "learning_rate": 0.01897203947368421,
155
+ "loss": 0.9117,
156
+ "step": 1050
157
  },
158
  {
159
+ "epoch": 0.4250386398763524,
160
+ "grad_norm": 0.000522978079970926,
161
+ "learning_rate": 0.018355263157894736,
162
+ "loss": 0.8939,
163
+ "step": 1100
164
  },
165
  {
166
+ "epoch": 0.44435857805255025,
167
+ "grad_norm": 0.0005450574099086225,
168
+ "learning_rate": 0.017738486842105265,
169
+ "loss": 0.9049,
170
+ "step": 1150
171
  },
172
  {
173
+ "epoch": 0.46367851622874806,
174
+ "grad_norm": 0.0005660468013957143,
175
+ "learning_rate": 0.017121710526315788,
176
+ "loss": 0.8944,
177
+ "step": 1200
178
  },
179
  {
180
+ "epoch": 0.48299845440494593,
181
+ "grad_norm": 0.0006663696258328855,
182
+ "learning_rate": 0.016504934210526314,
183
+ "loss": 0.8971,
184
+ "step": 1250
185
  },
186
  {
187
+ "epoch": 0.5023183925811437,
188
+ "grad_norm": 0.0005968479672446847,
189
+ "learning_rate": 0.01588815789473684,
190
+ "loss": 0.8917,
191
+ "step": 1300
192
  },
193
  {
194
+ "epoch": 0.5216383307573416,
195
+ "grad_norm": 0.0007491153082810342,
196
+ "learning_rate": 0.01527138157894737,
197
+ "loss": 0.8829,
198
+ "step": 1350
199
  },
200
  {
201
+ "epoch": 0.5409582689335394,
202
+ "grad_norm": 0.0006275599589571357,
203
+ "learning_rate": 0.014654605263157894,
204
+ "loss": 0.9058,
205
+ "step": 1400
206
  },
207
  {
208
+ "epoch": 0.5602782071097373,
209
+ "grad_norm": 0.0007617810624651611,
210
+ "learning_rate": 0.01403782894736842,
211
+ "loss": 0.9051,
212
+ "step": 1450
213
  },
214
  {
215
+ "epoch": 0.5795981452859351,
216
+ "grad_norm": 0.0006214394234120846,
217
+ "learning_rate": 0.013421052631578946,
218
+ "loss": 0.8879,
219
+ "step": 1500
220
  },
221
  {
222
+ "epoch": 0.5989180834621329,
223
+ "grad_norm": 0.0006560624460689723,
224
+ "learning_rate": 0.012804276315789473,
225
+ "loss": 0.8991,
226
+ "step": 1550
227
  },
228
  {
229
+ "epoch": 0.6182380216383307,
230
+ "grad_norm": 0.0007683933363296092,
231
+ "learning_rate": 0.0121875,
232
+ "loss": 0.9081,
233
+ "step": 1600
234
  },
235
  {
236
+ "epoch": 0.6375579598145286,
237
+ "grad_norm": 0.0005783849046565592,
238
+ "learning_rate": 0.011570723684210527,
239
+ "loss": 0.9067,
240
+ "step": 1650
241
  },
242
  {
243
+ "epoch": 0.6568778979907264,
244
+ "grad_norm": 0.0007958198548294604,
245
+ "learning_rate": 0.010953947368421053,
246
+ "loss": 0.885,
247
+ "step": 1700
248
  },
249
  {
250
+ "epoch": 0.6761978361669243,
251
+ "grad_norm": 0.0006095783319324255,
252
+ "learning_rate": 0.010337171052631579,
253
+ "loss": 0.8928,
254
+ "step": 1750
255
+ },
256
+ {
257
+ "epoch": 0.6955177743431221,
258
+ "grad_norm": 0.000699816329870373,
259
+ "learning_rate": 0.009720394736842105,
260
+ "loss": 0.903,
261
+ "step": 1800
262
  },
263
  {
264
+ "epoch": 0.7148377125193199,
265
+ "grad_norm": 0.0008128538611344993,
266
+ "learning_rate": 0.009103618421052631,
267
+ "loss": 0.9036,
268
+ "step": 1850
269
  },
270
  {
271
+ "epoch": 0.7341576506955177,
272
+ "grad_norm": 0.0006495247362181544,
273
+ "learning_rate": 0.008486842105263157,
274
+ "loss": 0.8907,
275
+ "step": 1900
276
  },
277
  {
278
+ "epoch": 0.7534775888717156,
279
+ "grad_norm": 0.0005265743238851428,
280
+ "learning_rate": 0.007870065789473685,
281
+ "loss": 0.8843,
282
+ "step": 1950
283
  },
284
  {
285
+ "epoch": 0.7727975270479135,
286
+ "grad_norm": 0.0006601494387723505,
287
+ "learning_rate": 0.0072532894736842095,
288
+ "loss": 0.8925,
289
+ "step": 2000
290
  },
291
  {
292
+ "epoch": 0.7921174652241113,
293
+ "grad_norm": 0.0005823367391712964,
294
+ "learning_rate": 0.0066365131578947365,
295
+ "loss": 0.8954,
296
+ "step": 2050
297
  },
298
  {
299
+ "epoch": 0.8114374034003091,
300
+ "grad_norm": 0.0005229181842878461,
301
+ "learning_rate": 0.0060197368421052635,
302
+ "loss": 0.903,
303
+ "step": 2100
304
  },
305
  {
306
+ "epoch": 0.8307573415765069,
307
+ "grad_norm": 0.0005145368631929159,
308
+ "learning_rate": 0.00540296052631579,
309
+ "loss": 0.8923,
310
+ "step": 2150
311
  },
312
  {
313
+ "epoch": 0.8500772797527048,
314
+ "grad_norm": 0.0006071292445994914,
315
+ "learning_rate": 0.004786184210526316,
316
+ "loss": 0.8804,
317
+ "step": 2200
318
  },
319
  {
320
+ "epoch": 0.8693972179289027,
321
+ "grad_norm": 0.0006730407476425171,
322
+ "learning_rate": 0.004169407894736842,
323
+ "loss": 0.8919,
324
+ "step": 2250
325
  },
326
  {
327
+ "epoch": 0.8887171561051005,
328
+ "grad_norm": 0.0006455178954638541,
329
+ "learning_rate": 0.003552631578947368,
330
+ "loss": 0.896,
331
+ "step": 2300
332
  },
333
  {
334
+ "epoch": 0.9080370942812983,
335
+ "grad_norm": 0.0004997382056899369,
336
+ "learning_rate": 0.002935855263157895,
337
+ "loss": 0.8921,
338
+ "step": 2350
339
  },
340
  {
341
+ "epoch": 0.9273570324574961,
342
+ "grad_norm": 0.00045192165998741984,
343
+ "learning_rate": 0.002319078947368421,
344
+ "loss": 0.8839,
345
+ "step": 2400
346
  },
347
  {
348
+ "epoch": 0.9466769706336939,
349
+ "grad_norm": 0.0004822098126169294,
350
+ "learning_rate": 0.0017023026315789475,
351
+ "loss": 0.8988,
352
+ "step": 2450
353
  },
354
  {
355
+ "epoch": 0.9659969088098919,
356
+ "grad_norm": 0.0005721400957554579,
357
+ "learning_rate": 0.0010855263157894736,
358
+ "loss": 0.9045,
359
+ "step": 2500
360
  },
361
  {
362
+ "epoch": 0.9853168469860897,
363
+ "grad_norm": 0.0005698847235180438,
364
+ "learning_rate": 0.00046875,
365
+ "loss": 0.893,
366
+ "step": 2550
367
+ },
368
+ {
369
+ "epoch": 1.0,
370
+ "eval_loss": 0.8954795002937317,
371
+ "eval_runtime": 1619.6937,
372
+ "eval_samples_per_second": 6.391,
373
+ "eval_steps_per_second": 0.799,
374
+ "step": 2588
375
  }
376
  ],
377
+ "logging_steps": 50,
378
+ "max_steps": 2588,
379
  "num_input_tokens_seen": 0,
380
+ "num_train_epochs": 1,
381
+ "save_steps": 50,
382
  "stateful_callbacks": {
383
  "TrainerControl": {
384
  "args": {
 
386
  "should_evaluate": false,
387
  "should_log": false,
388
  "should_save": true,
389
+ "should_training_stop": true
390
  },
391
  "attributes": {}
392
  }
393
  },
394
+ "total_flos": 6.394319248976609e+17,
395
  "train_batch_size": 4,
396
  "trial_name": null,
397
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:751a143eac84ace0878962d5a6c61e00ae90313081b3e87461034dcd220797a8
3
- size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e05980012924fba682aeee7b6335e0b05a33e8faf286e4879098fa0d40d4c691
3
+ size 5496