Commit d1f00d3 (verified) by rosethelocalfem · 1 parent: a5aa130

Upload folder using huggingface_hub
README.md CHANGED
@@ -1,5 +1,5 @@
 ---
-base_model: unsloth/gemma-3-1b-it-unsloth-bnb-4bit
+base_model: unsloth/llama-3.2-1b-unsloth-bnb-4bit
 library_name: peft
 ---
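Since the README now points the adapter at the Llama 3.2 base model, a minimal loading sketch follows; the adapter repo id below is a placeholder, not this repository's actual path:

```python
# Minimal sketch: attach this LoRA adapter to the new base model with PEFT.
# "your-username/your-adapter-repo" is a placeholder, not the real repo id.
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "unsloth/llama-3.2-1b-unsloth-bnb-4bit"
base = AutoModelForCausalLM.from_pretrained(base_id)
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = PeftModel.from_pretrained(base, "your-username/your-adapter-repo")
```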
adapter_config.json CHANGED
@@ -1,7 +1,7 @@
 {
   "alpha_pattern": {},
   "auto_mapping": null,
-  "base_model_name_or_path": "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
+  "base_model_name_or_path": "unsloth/llama-3.2-1b-unsloth-bnb-4bit",
   "bias": "none",
   "corda_config": null,
   "eva_config": null,
@@ -26,11 +26,11 @@
   "target_modules": [
     "up_proj",
     "down_proj",
-    "k_proj",
-    "v_proj",
-    "o_proj",
+    "q_proj",
     "gate_proj",
-    "q_proj"
+    "o_proj",
+    "k_proj",
+    "v_proj"
   ],
   "task_type": "CAUSAL_LM",
   "trainable_token_indices": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:74bef3db54e6bac3c49f92460956081a498029372a38f71d4d5ab83bf672595f
-size 52231312
+oid sha256:007508438510a9993b504d83e86e9d2d5d4639861498df2b660b9a76fd582e8a
+size 45118424
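The weights file is stored as a Git LFS pointer (version/oid/size). A small sketch for checking a downloaded copy against the new pointer:

```python
# Sketch: verify a downloaded adapter_model.safetensors against the LFS pointer.
import hashlib
import os

expected_oid = "007508438510a9993b504d83e86e9d2d5d4639861498df2b660b9a76fd582e8a"
expected_size = 45118424

def sha256_of(path):
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

path = "adapter_model.safetensors"
assert os.path.getsize(path) == expected_size
assert sha256_of(path) == expected_oid
```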
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:43d6c1506146720c3840cb82ed1d8e4a9b1bc15ed2d95d45e25cf580ca4fd753
-size 26913540
+oid sha256:9173944cba49237a56c3639e7e239e47c013274d5737a3b3bb7c6e04773d7fde
+size 23159290
scaler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e519e08ca9fc888c2bda67757300232e2f905bac4423928f91bead1dc00c1c3b
+size 988
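scaler.pt is new in this commit; judging by the filename and ~1 kB size, it is most likely the mixed-precision GradScaler state that the Trainer checkpoints alongside the optimizer (an assumption, not confirmed by the diff):

```python
# Sketch, assuming scaler.pt holds a GradScaler state dict saved by the Trainer.
import torch

scaler_state = torch.load("scaler.pt", weights_only=False)
print(scaler_state)  # e.g. scale, growth_factor, ... if the assumption holds
```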
special_tokens_map.json CHANGED
@@ -1,30 +1,20 @@
 {
-  "boi_token": "<start_of_image>",
   "bos_token": {
-    "content": "<bos>",
+    "content": "<|begin_of_text|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "eoi_token": "<end_of_image>",
   "eos_token": {
-    "content": "<end_of_turn>",
+    "content": "<|end_of_text|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
     "single_word": false
   },
-  "image_token": "<image_soft_token>",
   "pad_token": {
-    "content": "<pad>",
-    "lstrip": false,
-    "normalized": false,
-    "rstrip": false,
-    "single_word": false
-  },
-  "unk_token": {
-    "content": "<unk>",
+    "content": "<|finetune_right_pad_id|>",
     "lstrip": false,
     "normalized": false,
     "rstrip": false,
tokenizer.json CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4667f2089529e8e7657cfb6d1c19910ae71ff5f28aa7ab2ff2763330affad795
-size 33384568
+oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b
+size 17209920
tokenizer_config.json CHANGED
The diff for this file is too large to render. See raw diff
 
trainer_state.json CHANGED
The diff touches trainer_state.json lines 11–432, the log_history array for steps 1–60: every entry's grad_norm and loss changed between the two revisions, while epoch, learning_rate, and step are unchanged.

| step | epoch | learning_rate | grad_norm (old) | grad_norm (new) | loss (old) | loss (new) |
|---|---|---|---|---|---|---|
| 1 | 0.0009274287039183863 | 0.0 | 4.868922710418701 | 1.411400556564331 | 5.2325 | 4.3229 |
| 2 | 0.0018548574078367725 | 5e-06 | 5.811268329620361 | 2.266972064971924 | 4.7512 | 4.4344 |
| 3 | 0.0027822861117551586 | 1e-05 | 1.8592679500579834 | 0.6068131923675537 | 3.6976 | 3.2018 |
| 4 | 0.003709714815673545 | 1.5e-05 | 3.553828001022339 | 0.9275912642478943 | 4.4847 | 4.0208 |
| 5 | 0.004637143519591932 | 2e-05 | 3.7580947875976562 | 0.9753095507621765 | 4.3681 | 3.7113 |
| 6 | 0.005564572223510317 | 2.5e-05 | 6.560013294219971 | 2.6469643115997314 | 5.4142 | 5.2115 |
| 7 | 0.006492000927428704 | 3e-05 | 5.0084710121154785 | 1.4554330110549927 | 4.8401 | 4.2436 |
| 8 | 0.00741942963134709 | 3.5e-05 | 2.0739693641662598 | 0.5066649913787842 | 3.9564 | 3.5398 |
| 9 | 0.008346858335265477 | 4e-05 | 5.008288383483887 | 2.082305908203125 | 4.901 | 4.8127 |
| 10 | 0.009274287039183864 | 4.5e-05 | 1.2743533849716187 | 0.4055868983268738 | 3.7309 | 3.3663 |
| 11 | 0.010201715743102248 | 5e-05 | 1.1797062158584595 | 0.363479882478714 | 3.56 | 3.2632 |
| 12 | 0.011129144447020635 | 4.9e-05 | 3.787964344024658 | 1.7619837522506714 | 5.1572 | 4.7703 |
| 13 | 0.012056573150939021 | 4.8e-05 | 1.1445714235305786 | 0.48136666417121887 | 3.5998 | 3.1614 |
| 14 | 0.012984001854857407 | 4.7e-05 | 1.2640348672866821 | 0.48472875356674194 | 3.6054 | 3.3447 |
| 15 | 0.013911430558775794 | 4.600000000000001e-05 | 0.9739102721214294 | 0.5840467214584351 | 3.4424 | 3.3037 |
| 16 | 0.01483885926269418 | 4.5e-05 | 3.6449685096740723 | 1.9077085256576538 | 5.1529 | 4.614 |
| 17 | 0.015766287966612568 | 4.4000000000000006e-05 | 1.0862540006637573 | 0.5734739899635315 | 3.4454 | 3.1754 |
| 18 | 0.016693716670530954 | 4.3e-05 | 2.713123083114624 | 1.8598120212554932 | 4.4682 | 4.3064 |
| 19 | 0.01762114537444934 | 4.2e-05 | 1.0332715511322021 | 0.5264394283294678 | 3.3278 | 3.2004 |
| 20 | 0.018548574078367727 | 4.1e-05 | 1.1742037534713745 | 0.6078647375106812 | 3.5884 | 3.3305 |
| 21 | 0.01947600278228611 | 4e-05 | 1.1539186239242554 | 0.49842751026153564 | 3.6538 | 3.4103 |
| 22 | 0.020403431486204496 | 3.9000000000000006e-05 | 1.303881287574768 | 0.6825811862945557 | 3.387 | 3.4553 |
| 23 | 0.021330860190122883 | 3.8e-05 | 1.8081673383712769 | 1.0185341835021973 | 3.7469 | 3.6376 |
| 24 | 0.02225828889404127 | 3.7e-05 | 1.0129601955413818 | 0.3887212574481964 | 3.0917 | 2.9089 |
| 25 | 0.023185717597959656 | 3.6e-05 | 2.841965436935425 | 1.5926904678344727 | 3.6627 | 3.6112 |
| 26 | 0.024113146301878042 | 3.5e-05 | 2.4179654121398926 | 1.4020466804504395 | 3.8892 | 3.8137 |
| 27 | 0.025040575005796428 | 3.4000000000000007e-05 | 1.5163795948028564 | 0.8697665929794312 | 3.2586 | 3.3651 |
| 28 | 0.025968003709714815 | 3.3e-05 | 2.0588231086730957 | 1.1937010288238525 | 3.7464 | 3.7935 |
| 29 | 0.0268954324136332 | 3.2000000000000005e-05 | 1.082326889038086 | 0.583304762840271 | 3.5218 | 3.279 |
| 30 | 0.027822861117551587 | 3.1e-05 | 3.831888437271118 | 2.1346917152404785 | 4.0704 | 4.1021 |
| 31 | 0.028750289821469974 | 3e-05 | 1.2819643020629883 | 0.7281085848808289 | 3.5 | 3.3558 |
| 32 | 0.02967771852538836 | 2.9e-05 | 2.7546770572662354 | 1.7035914659500122 | 3.6709 | 3.5765 |
| 33 | 0.030605147229306746 | 2.8000000000000003e-05 | 2.127004861831665 | 1.3239504098892212 | 3.7116 | 3.6181 |
| 34 | 0.031532575933225136 | 2.7000000000000002e-05 | 1.5426557064056396 | 0.9565314650535583 | 3.7935 | 3.5135 |
| 35 | 0.03246000463714352 | 2.6000000000000002e-05 | 1.4559600353240967 | 1.0347371101379395 | 3.2505 | 3.2434 |
| 36 | 0.03338743334106191 | 2.5e-05 | 1.2187875509262085 | 0.54816734790802 | 3.3403 | 3.1409 |
| 37 | 0.03431486204498029 | 2.4e-05 | 1.4153465032577515 | 0.4945932626724243 | 2.73 | 2.9236 |
| 38 | 0.03524229074889868 | 2.3000000000000003e-05 | 1.2564786672592163 | 0.48317691683769226 | 3.3627 | 3.3575 |
| 39 | 0.036169719452817065 | 2.2000000000000003e-05 | 3.221158742904663 | 1.2033950090408325 | 3.6366 | 3.6258 |
| 40 | 0.037097148156735454 | 2.1e-05 | 1.5206141471862793 | 1.137432336807251 | 3.5083 | 3.5354 |
| 41 | 0.03802457686065384 | 2e-05 | 3.3207788467407227 | 1.494925856590271 | 3.6526 | 3.6365 |
| 42 | 0.03895200556457222 | 1.9e-05 | 2.6947567462921143 | 1.1683684587478638 | 3.8998 | 3.7584 |
| 43 | 0.03987943426849061 | 1.8e-05 | 3.0442309379577637 | 1.5439304113388062 | 4.0098 | 3.7813 |
| 44 | 0.04080686297240899 | 1.7000000000000003e-05 | 3.0149142742156982 | 1.4335922002792358 | 3.6688 | 3.5158 |
| 45 | 0.04173429167632738 | 1.6000000000000003e-05 | 2.458320140838623 | 1.0030946731567383 | 3.7055 | 3.4701 |
| 46 | 0.042661720380245766 | 1.5e-05 | 1.5240652561187744 | 0.8070414662361145 | 3.2421 | 3.259 |
| 47 | 0.043589149084164155 | 1.4000000000000001e-05 | 5.896821975708008 | 1.9914580583572388 | 4.7299 | 4.3585 |
| 48 | 0.04451657778808254 | 1.3000000000000001e-05 | 1.3863835334777832 | 0.6409067511558533 | 3.3415 | 3.3393 |
| 49 | 0.04544400649200093 | 1.2e-05 | 1.5955193042755127 | 0.910348117351532 | 3.3076 | 3.4981 |
| 50 | 0.04637143519591931 | 1.1000000000000001e-05 | 2.4756007194519043 | 1.3223044872283936 | 3.8448 | 3.6996 |
| 51 | 0.0472988638998377 | 1e-05 | 1.1582512855529785 | 0.3819347620010376 | 3.2583 | 2.9741 |
| 52 | 0.048226292603756084 | 9e-06 | 2.394286632537842 | 1.3223705291748047 | 3.3061 | 3.7789 |
| 53 | 0.049153721307674474 | 8.000000000000001e-06 | 1.4328668117523193 | 0.6920037865638733 | 3.1576 | 3.1836 |
| 54 | 0.050081150011592857 | 7.000000000000001e-06 | 1.2915327548980713 | 0.6605297923088074 | 3.0897 | 2.998 |
| 55 | 0.051008578715511246 | 6e-06 | 1.9309310913085938 | 0.8535223603248596 | 3.372 | 3.1944 |
| 56 | 0.05193600741942963 | 5e-06 | 3.784912586212158 | 1.4277936220169067 | 3.7925 | 3.6891 |
| 57 | 0.05286343612334802 | 4.000000000000001e-06 | 9.921348571777344 | 3.182830572128296 | 5.3381 | 5.1859 |
| 58 | 0.0537908648272664 | 3e-06 | 1.3432551622390747 | 0.6000503301620483 | 3.543 | 3.3717 |
| 59 | 0.05471829353118479 | 2.0000000000000003e-06 | 2.20005202293396 | 1.301562786102295 | 3.658 | 3.4226 |
| 60 | 0.055645722235103175 | 1.0000000000000002e-06 | 1.4739794731140137 | 0.7561865448951721 | 3.258 | 3.0751 |

The only other change in this file is total_flos:

@@ -447,7 +447,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 715238756384256.0,
+  "total_flos": 951810432098304.0,
   "train_batch_size": 2,
   "trial_name": null,
   "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:c553330158f5669e0fd93f1ff21b7b08ba11a6a294989ca3ba92a564ae5ca241
+oid sha256:78092493866cfdb6e6800bbb6a7531765b620f13789e657a9469b94c1b6b0fc4
 size 5560
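training_args.bin is the TrainingArguments object the Trainer serializes with torch.save; only its oid changed here (same 5560-byte size). A hedged inspection sketch, assuming a transformers version compatible with the one used for training:

```python
# Sketch: inspect the serialized TrainingArguments. Loading requires trusting
# the pickle, so only run this on files you already trust.
import torch

args = torch.load("training_args.bin", weights_only=False)
print(args.per_device_train_batch_size, args.learning_rate, args.num_train_epochs)
```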