AhilanPonnusamy commited on
Commit
c29b9d8
·
verified ·
1 Parent(s): 8bb6a04

Upload folder using huggingface_hub

Browse files
Files changed (7) hide show
  1. config.json +43 -0
  2. model.safetensors +3 -0
  3. optimizer.pt +3 -0
  4. rng_state.pth +3 -0
  5. scheduler.pt +3 -0
  6. trainer_state.json +1405 -0
  7. training_args.bin +3 -0
config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "./smollm-135M",
3
+ "_num_labels": 3,
4
+ "architectures": [
5
+ "LlamaForSequenceClassification"
6
+ ],
7
+ "attention_bias": false,
8
+ "attention_dropout": 0.0,
9
+ "bos_token_id": 0,
10
+ "eos_token_id": 0,
11
+ "head_dim": 64,
12
+ "hidden_act": "silu",
13
+ "hidden_size": 576,
14
+ "id2label": {
15
+ "0": "negative",
16
+ "1": "neutral",
17
+ "2": "positive"
18
+ },
19
+ "initializer_range": 0.02,
20
+ "intermediate_size": 1536,
21
+ "label2id": {
22
+ "negative": 0,
23
+ "neutral": 1,
24
+ "positive": 2
25
+ },
26
+ "max_position_embeddings": 2048,
27
+ "mlp_bias": false,
28
+ "model_type": "llama",
29
+ "num_attention_heads": 9,
30
+ "num_hidden_layers": 30,
31
+ "num_key_value_heads": 3,
32
+ "pad_token_id": 0,
33
+ "pretraining_tp": 1,
34
+ "problem_type": "single_label_classification",
35
+ "rms_norm_eps": 1e-05,
36
+ "rope_scaling": null,
37
+ "rope_theta": 10000.0,
38
+ "tie_word_embeddings": true,
39
+ "torch_dtype": "float32",
40
+ "transformers_version": "4.48.0",
41
+ "use_cache": true,
42
+ "vocab_size": 49152
43
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:10bb69df21ac54718dec2196ab81e4475b41f8cedcab8728efdcf6d706466120
3
+ size 538097400
optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a5b1e201a3a7011b6d538c88e32709395bb13363b480ad94da54a894fab81d4
3
+ size 1076356555
rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:391d01d3aeb4a35151817d446e4ba0b9c8a04084ae1b1b66eda188a30729da0a
3
+ size 14455
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a41615828fcc7147e89faeb04aac00fba7451f6a1f160d02a50a764db727d7a7
3
+ size 1465
trainer_state.json ADDED
@@ -0,0 +1,1405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.014622284099459648,
3
+ "best_model_checkpoint": "sentiment-distillation-smollm/checkpoint-1875",
4
+ "epoch": 3.0,
5
+ "eval_steps": 500,
6
+ "global_step": 1875,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.016,
13
+ "grad_norm": 9.476157188415527,
14
+ "learning_rate": 1.597444089456869e-06,
15
+ "loss": 1.074,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.032,
20
+ "grad_norm": 10.569486618041992,
21
+ "learning_rate": 3.194888178913738e-06,
22
+ "loss": 1.0372,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.048,
27
+ "grad_norm": 8.480586051940918,
28
+ "learning_rate": 4.792332268370607e-06,
29
+ "loss": 0.9365,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.064,
34
+ "grad_norm": 6.626020908355713,
35
+ "learning_rate": 6.389776357827476e-06,
36
+ "loss": 0.8842,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.08,
41
+ "grad_norm": 12.755849838256836,
42
+ "learning_rate": 7.987220447284345e-06,
43
+ "loss": 0.9098,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.096,
48
+ "grad_norm": 7.676151275634766,
49
+ "learning_rate": 9.584664536741214e-06,
50
+ "loss": 0.7348,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.112,
55
+ "grad_norm": 5.426048278808594,
56
+ "learning_rate": 1.1182108626198083e-05,
57
+ "loss": 0.4936,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.128,
62
+ "grad_norm": 3.9603002071380615,
63
+ "learning_rate": 1.2779552715654951e-05,
64
+ "loss": 0.3438,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.144,
69
+ "grad_norm": 3.396127700805664,
70
+ "learning_rate": 1.4376996805111822e-05,
71
+ "loss": 0.1482,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.16,
76
+ "grad_norm": 4.765463829040527,
77
+ "learning_rate": 1.597444089456869e-05,
78
+ "loss": 0.1855,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.176,
83
+ "grad_norm": 0.04306459426879883,
84
+ "learning_rate": 1.757188498402556e-05,
85
+ "loss": 0.1788,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.192,
90
+ "grad_norm": 0.04322272166609764,
91
+ "learning_rate": 1.9169329073482428e-05,
92
+ "loss": 0.1233,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.208,
97
+ "grad_norm": 0.053582437336444855,
98
+ "learning_rate": 2.07667731629393e-05,
99
+ "loss": 0.0908,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.224,
104
+ "grad_norm": 8.254273414611816,
105
+ "learning_rate": 2.2364217252396165e-05,
106
+ "loss": 0.1362,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.24,
111
+ "grad_norm": 0.054730091243982315,
112
+ "learning_rate": 2.3961661341853036e-05,
113
+ "loss": 0.072,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.256,
118
+ "grad_norm": 0.0763891264796257,
119
+ "learning_rate": 2.5559105431309903e-05,
120
+ "loss": 0.1823,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.272,
125
+ "grad_norm": 6.117145538330078,
126
+ "learning_rate": 2.7156549520766773e-05,
127
+ "loss": 0.0963,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.288,
132
+ "grad_norm": 1.8248779773712158,
133
+ "learning_rate": 2.8753993610223644e-05,
134
+ "loss": 0.2741,
135
+ "step": 180
136
+ },
137
+ {
138
+ "epoch": 0.304,
139
+ "grad_norm": 0.014617039822041988,
140
+ "learning_rate": 3.0351437699680514e-05,
141
+ "loss": 0.0691,
142
+ "step": 190
143
+ },
144
+ {
145
+ "epoch": 0.32,
146
+ "grad_norm": 2.316850423812866,
147
+ "learning_rate": 3.194888178913738e-05,
148
+ "loss": 0.0186,
149
+ "step": 200
150
+ },
151
+ {
152
+ "epoch": 0.336,
153
+ "grad_norm": 0.010957750491797924,
154
+ "learning_rate": 3.354632587859425e-05,
155
+ "loss": 0.1676,
156
+ "step": 210
157
+ },
158
+ {
159
+ "epoch": 0.352,
160
+ "grad_norm": 0.02525000460445881,
161
+ "learning_rate": 3.514376996805112e-05,
162
+ "loss": 0.1689,
163
+ "step": 220
164
+ },
165
+ {
166
+ "epoch": 0.368,
167
+ "grad_norm": 0.010558371432125568,
168
+ "learning_rate": 3.6741214057507985e-05,
169
+ "loss": 0.1759,
170
+ "step": 230
171
+ },
172
+ {
173
+ "epoch": 0.384,
174
+ "grad_norm": 0.009851646609604359,
175
+ "learning_rate": 3.8338658146964856e-05,
176
+ "loss": 0.1253,
177
+ "step": 240
178
+ },
179
+ {
180
+ "epoch": 0.4,
181
+ "grad_norm": 0.009924142621457577,
182
+ "learning_rate": 3.9936102236421726e-05,
183
+ "loss": 0.1141,
184
+ "step": 250
185
+ },
186
+ {
187
+ "epoch": 0.416,
188
+ "grad_norm": 0.00981380045413971,
189
+ "learning_rate": 4.15335463258786e-05,
190
+ "loss": 0.1227,
191
+ "step": 260
192
+ },
193
+ {
194
+ "epoch": 0.432,
195
+ "grad_norm": 4.685519218444824,
196
+ "learning_rate": 4.313099041533547e-05,
197
+ "loss": 0.2,
198
+ "step": 270
199
+ },
200
+ {
201
+ "epoch": 0.448,
202
+ "grad_norm": 1.1163793802261353,
203
+ "learning_rate": 4.472843450479233e-05,
204
+ "loss": 0.073,
205
+ "step": 280
206
+ },
207
+ {
208
+ "epoch": 0.464,
209
+ "grad_norm": 0.01022783387452364,
210
+ "learning_rate": 4.632587859424921e-05,
211
+ "loss": 0.1287,
212
+ "step": 290
213
+ },
214
+ {
215
+ "epoch": 0.48,
216
+ "grad_norm": 0.018457185477018356,
217
+ "learning_rate": 4.792332268370607e-05,
218
+ "loss": 0.0794,
219
+ "step": 300
220
+ },
221
+ {
222
+ "epoch": 0.496,
223
+ "grad_norm": 10.666505813598633,
224
+ "learning_rate": 4.952076677316294e-05,
225
+ "loss": 0.17,
226
+ "step": 310
227
+ },
228
+ {
229
+ "epoch": 0.512,
230
+ "grad_norm": 2.132030725479126,
231
+ "learning_rate": 4.987553342816501e-05,
232
+ "loss": 0.0952,
233
+ "step": 320
234
+ },
235
+ {
236
+ "epoch": 0.528,
237
+ "grad_norm": 11.730767250061035,
238
+ "learning_rate": 4.969772403982931e-05,
239
+ "loss": 0.1995,
240
+ "step": 330
241
+ },
242
+ {
243
+ "epoch": 0.544,
244
+ "grad_norm": 0.01148867979645729,
245
+ "learning_rate": 4.95199146514936e-05,
246
+ "loss": 0.1019,
247
+ "step": 340
248
+ },
249
+ {
250
+ "epoch": 0.56,
251
+ "grad_norm": 0.047348447144031525,
252
+ "learning_rate": 4.9342105263157894e-05,
253
+ "loss": 0.0706,
254
+ "step": 350
255
+ },
256
+ {
257
+ "epoch": 0.576,
258
+ "grad_norm": 0.010151210241019726,
259
+ "learning_rate": 4.916429587482219e-05,
260
+ "loss": 0.0878,
261
+ "step": 360
262
+ },
263
+ {
264
+ "epoch": 0.592,
265
+ "grad_norm": 0.4587936997413635,
266
+ "learning_rate": 4.8986486486486486e-05,
267
+ "loss": 0.0864,
268
+ "step": 370
269
+ },
270
+ {
271
+ "epoch": 0.608,
272
+ "grad_norm": 0.1231539249420166,
273
+ "learning_rate": 4.8808677098150786e-05,
274
+ "loss": 0.0096,
275
+ "step": 380
276
+ },
277
+ {
278
+ "epoch": 0.624,
279
+ "grad_norm": 0.005615161266177893,
280
+ "learning_rate": 4.863086770981508e-05,
281
+ "loss": 0.1351,
282
+ "step": 390
283
+ },
284
+ {
285
+ "epoch": 0.64,
286
+ "grad_norm": 0.02301601506769657,
287
+ "learning_rate": 4.845305832147938e-05,
288
+ "loss": 0.0363,
289
+ "step": 400
290
+ },
291
+ {
292
+ "epoch": 0.656,
293
+ "grad_norm": 3.3105900287628174,
294
+ "learning_rate": 4.827524893314367e-05,
295
+ "loss": 0.0764,
296
+ "step": 410
297
+ },
298
+ {
299
+ "epoch": 0.672,
300
+ "grad_norm": 0.0835251584649086,
301
+ "learning_rate": 4.809743954480797e-05,
302
+ "loss": 0.0238,
303
+ "step": 420
304
+ },
305
+ {
306
+ "epoch": 0.688,
307
+ "grad_norm": 0.0839819684624672,
308
+ "learning_rate": 4.7919630156472264e-05,
309
+ "loss": 0.1182,
310
+ "step": 430
311
+ },
312
+ {
313
+ "epoch": 0.704,
314
+ "grad_norm": 0.0160073135048151,
315
+ "learning_rate": 4.774182076813656e-05,
316
+ "loss": 0.1647,
317
+ "step": 440
318
+ },
319
+ {
320
+ "epoch": 0.72,
321
+ "grad_norm": 0.09903815388679504,
322
+ "learning_rate": 4.756401137980086e-05,
323
+ "loss": 0.138,
324
+ "step": 450
325
+ },
326
+ {
327
+ "epoch": 0.736,
328
+ "grad_norm": 8.14660930633545,
329
+ "learning_rate": 4.738620199146515e-05,
330
+ "loss": 0.0731,
331
+ "step": 460
332
+ },
333
+ {
334
+ "epoch": 0.752,
335
+ "grad_norm": 0.07308264076709747,
336
+ "learning_rate": 4.720839260312945e-05,
337
+ "loss": 0.017,
338
+ "step": 470
339
+ },
340
+ {
341
+ "epoch": 0.768,
342
+ "grad_norm": 0.8220946788787842,
343
+ "learning_rate": 4.703058321479374e-05,
344
+ "loss": 0.0299,
345
+ "step": 480
346
+ },
347
+ {
348
+ "epoch": 0.784,
349
+ "grad_norm": 0.007219772785902023,
350
+ "learning_rate": 4.685277382645804e-05,
351
+ "loss": 0.0162,
352
+ "step": 490
353
+ },
354
+ {
355
+ "epoch": 0.8,
356
+ "grad_norm": 0.009510455653071404,
357
+ "learning_rate": 4.6674964438122335e-05,
358
+ "loss": 0.0011,
359
+ "step": 500
360
+ },
361
+ {
362
+ "epoch": 0.816,
363
+ "grad_norm": 9.018925666809082,
364
+ "learning_rate": 4.6497155049786634e-05,
365
+ "loss": 0.1081,
366
+ "step": 510
367
+ },
368
+ {
369
+ "epoch": 0.832,
370
+ "grad_norm": 0.008953461423516273,
371
+ "learning_rate": 4.631934566145093e-05,
372
+ "loss": 0.0494,
373
+ "step": 520
374
+ },
375
+ {
376
+ "epoch": 0.848,
377
+ "grad_norm": 10.596187591552734,
378
+ "learning_rate": 4.614153627311522e-05,
379
+ "loss": 0.0158,
380
+ "step": 530
381
+ },
382
+ {
383
+ "epoch": 0.864,
384
+ "grad_norm": 2.3671252727508545,
385
+ "learning_rate": 4.596372688477952e-05,
386
+ "loss": 0.0816,
387
+ "step": 540
388
+ },
389
+ {
390
+ "epoch": 0.88,
391
+ "grad_norm": 0.002445698482915759,
392
+ "learning_rate": 4.578591749644381e-05,
393
+ "loss": 0.0042,
394
+ "step": 550
395
+ },
396
+ {
397
+ "epoch": 0.896,
398
+ "grad_norm": 11.779576301574707,
399
+ "learning_rate": 4.560810810810811e-05,
400
+ "loss": 0.1067,
401
+ "step": 560
402
+ },
403
+ {
404
+ "epoch": 0.912,
405
+ "grad_norm": 7.589051246643066,
406
+ "learning_rate": 4.5430298719772405e-05,
407
+ "loss": 0.0853,
408
+ "step": 570
409
+ },
410
+ {
411
+ "epoch": 0.928,
412
+ "grad_norm": 0.11888572573661804,
413
+ "learning_rate": 4.5252489331436705e-05,
414
+ "loss": 0.0317,
415
+ "step": 580
416
+ },
417
+ {
418
+ "epoch": 0.944,
419
+ "grad_norm": 0.07746365666389465,
420
+ "learning_rate": 4.5074679943101e-05,
421
+ "loss": 0.2303,
422
+ "step": 590
423
+ },
424
+ {
425
+ "epoch": 0.96,
426
+ "grad_norm": 0.004527238663285971,
427
+ "learning_rate": 4.489687055476529e-05,
428
+ "loss": 0.0665,
429
+ "step": 600
430
+ },
431
+ {
432
+ "epoch": 0.976,
433
+ "grad_norm": 4.120000839233398,
434
+ "learning_rate": 4.471906116642959e-05,
435
+ "loss": 0.0546,
436
+ "step": 610
437
+ },
438
+ {
439
+ "epoch": 0.992,
440
+ "grad_norm": 0.0016751989023759961,
441
+ "learning_rate": 4.4541251778093884e-05,
442
+ "loss": 0.0051,
443
+ "step": 620
444
+ },
445
+ {
446
+ "epoch": 1.0,
447
+ "eval_accuracy": 0.9939879759519038,
448
+ "eval_f1_macro": 0.9861065168559803,
449
+ "eval_f1_micro": 0.9939879759519038,
450
+ "eval_f1_weighted": 0.9939373007372426,
451
+ "eval_loss": 0.015998151153326035,
452
+ "eval_precision_macro": 0.9959514170040485,
453
+ "eval_precision_micro": 0.9939879759519038,
454
+ "eval_precision_weighted": 0.994060996486901,
455
+ "eval_recall_macro": 0.9770065284178188,
456
+ "eval_recall_micro": 0.9939879759519038,
457
+ "eval_recall_weighted": 0.9939879759519038,
458
+ "eval_runtime": 10.7994,
459
+ "eval_samples_per_second": 46.206,
460
+ "eval_steps_per_second": 2.963,
461
+ "step": 625
462
+ },
463
+ {
464
+ "epoch": 1.008,
465
+ "grad_norm": 4.363436222076416,
466
+ "learning_rate": 4.436344238975818e-05,
467
+ "loss": 0.0119,
468
+ "step": 630
469
+ },
470
+ {
471
+ "epoch": 1.024,
472
+ "grad_norm": 0.002567918971180916,
473
+ "learning_rate": 4.4185633001422476e-05,
474
+ "loss": 0.0736,
475
+ "step": 640
476
+ },
477
+ {
478
+ "epoch": 1.04,
479
+ "grad_norm": 0.00732471002265811,
480
+ "learning_rate": 4.4007823613086776e-05,
481
+ "loss": 0.0001,
482
+ "step": 650
483
+ },
484
+ {
485
+ "epoch": 1.056,
486
+ "grad_norm": 0.002863505156710744,
487
+ "learning_rate": 4.383001422475107e-05,
488
+ "loss": 0.0608,
489
+ "step": 660
490
+ },
491
+ {
492
+ "epoch": 1.072,
493
+ "grad_norm": 0.008520668372511864,
494
+ "learning_rate": 4.365220483641537e-05,
495
+ "loss": 0.0125,
496
+ "step": 670
497
+ },
498
+ {
499
+ "epoch": 1.088,
500
+ "grad_norm": 0.001538406009785831,
501
+ "learning_rate": 4.347439544807966e-05,
502
+ "loss": 0.0653,
503
+ "step": 680
504
+ },
505
+ {
506
+ "epoch": 1.104,
507
+ "grad_norm": 0.001671103062108159,
508
+ "learning_rate": 4.3296586059743954e-05,
509
+ "loss": 0.081,
510
+ "step": 690
511
+ },
512
+ {
513
+ "epoch": 1.12,
514
+ "grad_norm": 0.20056261122226715,
515
+ "learning_rate": 4.3118776671408254e-05,
516
+ "loss": 0.0057,
517
+ "step": 700
518
+ },
519
+ {
520
+ "epoch": 1.1360000000000001,
521
+ "grad_norm": 0.008405996486544609,
522
+ "learning_rate": 4.294096728307255e-05,
523
+ "loss": 0.0091,
524
+ "step": 710
525
+ },
526
+ {
527
+ "epoch": 1.152,
528
+ "grad_norm": 0.19218170642852783,
529
+ "learning_rate": 4.2763157894736847e-05,
530
+ "loss": 0.0019,
531
+ "step": 720
532
+ },
533
+ {
534
+ "epoch": 1.168,
535
+ "grad_norm": 0.07524432241916656,
536
+ "learning_rate": 4.258534850640114e-05,
537
+ "loss": 0.0952,
538
+ "step": 730
539
+ },
540
+ {
541
+ "epoch": 1.184,
542
+ "grad_norm": 0.0015570322284474969,
543
+ "learning_rate": 4.240753911806544e-05,
544
+ "loss": 0.0435,
545
+ "step": 740
546
+ },
547
+ {
548
+ "epoch": 1.2,
549
+ "grad_norm": 0.0036333040334284306,
550
+ "learning_rate": 4.222972972972973e-05,
551
+ "loss": 0.0207,
552
+ "step": 750
553
+ },
554
+ {
555
+ "epoch": 1.216,
556
+ "grad_norm": 9.78964900970459,
557
+ "learning_rate": 4.205192034139403e-05,
558
+ "loss": 0.1004,
559
+ "step": 760
560
+ },
561
+ {
562
+ "epoch": 1.232,
563
+ "grad_norm": 0.0014888375299051404,
564
+ "learning_rate": 4.187411095305832e-05,
565
+ "loss": 0.0671,
566
+ "step": 770
567
+ },
568
+ {
569
+ "epoch": 1.248,
570
+ "grad_norm": 0.023692140355706215,
571
+ "learning_rate": 4.169630156472262e-05,
572
+ "loss": 0.0026,
573
+ "step": 780
574
+ },
575
+ {
576
+ "epoch": 1.264,
577
+ "grad_norm": 0.004593148361891508,
578
+ "learning_rate": 4.151849217638692e-05,
579
+ "loss": 0.0423,
580
+ "step": 790
581
+ },
582
+ {
583
+ "epoch": 1.28,
584
+ "grad_norm": 0.003923716489225626,
585
+ "learning_rate": 4.134068278805121e-05,
586
+ "loss": 0.0236,
587
+ "step": 800
588
+ },
589
+ {
590
+ "epoch": 1.296,
591
+ "grad_norm": 0.002044517546892166,
592
+ "learning_rate": 4.116287339971551e-05,
593
+ "loss": 0.005,
594
+ "step": 810
595
+ },
596
+ {
597
+ "epoch": 1.312,
598
+ "grad_norm": 0.022982032969594002,
599
+ "learning_rate": 4.09850640113798e-05,
600
+ "loss": 0.0697,
601
+ "step": 820
602
+ },
603
+ {
604
+ "epoch": 1.328,
605
+ "grad_norm": 0.23058412969112396,
606
+ "learning_rate": 4.08072546230441e-05,
607
+ "loss": 0.1222,
608
+ "step": 830
609
+ },
610
+ {
611
+ "epoch": 1.3439999999999999,
612
+ "grad_norm": 0.0017761716153472662,
613
+ "learning_rate": 4.0629445234708395e-05,
614
+ "loss": 0.0022,
615
+ "step": 840
616
+ },
617
+ {
618
+ "epoch": 1.3599999999999999,
619
+ "grad_norm": 0.12850730121135712,
620
+ "learning_rate": 4.0451635846372695e-05,
621
+ "loss": 0.0189,
622
+ "step": 850
623
+ },
624
+ {
625
+ "epoch": 1.376,
626
+ "grad_norm": 0.0014099746476858854,
627
+ "learning_rate": 4.027382645803698e-05,
628
+ "loss": 0.0001,
629
+ "step": 860
630
+ },
631
+ {
632
+ "epoch": 1.392,
633
+ "grad_norm": 10.44194507598877,
634
+ "learning_rate": 4.009601706970128e-05,
635
+ "loss": 0.1423,
636
+ "step": 870
637
+ },
638
+ {
639
+ "epoch": 1.408,
640
+ "grad_norm": 5.446048736572266,
641
+ "learning_rate": 3.9918207681365574e-05,
642
+ "loss": 0.1013,
643
+ "step": 880
644
+ },
645
+ {
646
+ "epoch": 1.424,
647
+ "grad_norm": 0.017437923699617386,
648
+ "learning_rate": 3.9740398293029873e-05,
649
+ "loss": 0.0002,
650
+ "step": 890
651
+ },
652
+ {
653
+ "epoch": 1.44,
654
+ "grad_norm": 0.007926377467811108,
655
+ "learning_rate": 3.956258890469417e-05,
656
+ "loss": 0.0661,
657
+ "step": 900
658
+ },
659
+ {
660
+ "epoch": 1.456,
661
+ "grad_norm": 0.0029465279076248407,
662
+ "learning_rate": 3.9384779516358466e-05,
663
+ "loss": 0.0338,
664
+ "step": 910
665
+ },
666
+ {
667
+ "epoch": 1.472,
668
+ "grad_norm": 0.0010107713751494884,
669
+ "learning_rate": 3.9206970128022766e-05,
670
+ "loss": 0.0171,
671
+ "step": 920
672
+ },
673
+ {
674
+ "epoch": 1.488,
675
+ "grad_norm": 0.036259789019823074,
676
+ "learning_rate": 3.902916073968706e-05,
677
+ "loss": 0.0004,
678
+ "step": 930
679
+ },
680
+ {
681
+ "epoch": 1.504,
682
+ "grad_norm": 0.7286566495895386,
683
+ "learning_rate": 3.885135135135135e-05,
684
+ "loss": 0.014,
685
+ "step": 940
686
+ },
687
+ {
688
+ "epoch": 1.52,
689
+ "grad_norm": 0.008453834801912308,
690
+ "learning_rate": 3.8673541963015645e-05,
691
+ "loss": 0.0002,
692
+ "step": 950
693
+ },
694
+ {
695
+ "epoch": 1.536,
696
+ "grad_norm": 0.1271582841873169,
697
+ "learning_rate": 3.8495732574679944e-05,
698
+ "loss": 0.0043,
699
+ "step": 960
700
+ },
701
+ {
702
+ "epoch": 1.552,
703
+ "grad_norm": 6.197739124298096,
704
+ "learning_rate": 3.831792318634424e-05,
705
+ "loss": 0.0075,
706
+ "step": 970
707
+ },
708
+ {
709
+ "epoch": 1.568,
710
+ "grad_norm": 0.008371386677026749,
711
+ "learning_rate": 3.814011379800854e-05,
712
+ "loss": 0.0299,
713
+ "step": 980
714
+ },
715
+ {
716
+ "epoch": 1.584,
717
+ "grad_norm": 0.0011657042196020484,
718
+ "learning_rate": 3.796230440967283e-05,
719
+ "loss": 0.1188,
720
+ "step": 990
721
+ },
722
+ {
723
+ "epoch": 1.6,
724
+ "grad_norm": 0.010952652432024479,
725
+ "learning_rate": 3.778449502133713e-05,
726
+ "loss": 0.0116,
727
+ "step": 1000
728
+ },
729
+ {
730
+ "epoch": 1.616,
731
+ "grad_norm": 0.0010061347857117653,
732
+ "learning_rate": 3.760668563300143e-05,
733
+ "loss": 0.0043,
734
+ "step": 1010
735
+ },
736
+ {
737
+ "epoch": 1.6320000000000001,
738
+ "grad_norm": 0.06395132839679718,
739
+ "learning_rate": 3.742887624466572e-05,
740
+ "loss": 0.0445,
741
+ "step": 1020
742
+ },
743
+ {
744
+ "epoch": 1.6480000000000001,
745
+ "grad_norm": 0.001108819618821144,
746
+ "learning_rate": 3.7251066856330015e-05,
747
+ "loss": 0.0323,
748
+ "step": 1030
749
+ },
750
+ {
751
+ "epoch": 1.6640000000000001,
752
+ "grad_norm": 0.0014280881732702255,
753
+ "learning_rate": 3.707325746799431e-05,
754
+ "loss": 0.0008,
755
+ "step": 1040
756
+ },
757
+ {
758
+ "epoch": 1.6800000000000002,
759
+ "grad_norm": 0.0012123563792556524,
760
+ "learning_rate": 3.689544807965861e-05,
761
+ "loss": 0.0014,
762
+ "step": 1050
763
+ },
764
+ {
765
+ "epoch": 1.696,
766
+ "grad_norm": 0.0011878483928740025,
767
+ "learning_rate": 3.67176386913229e-05,
768
+ "loss": 0.0524,
769
+ "step": 1060
770
+ },
771
+ {
772
+ "epoch": 1.712,
773
+ "grad_norm": 0.0037567925173789263,
774
+ "learning_rate": 3.65398293029872e-05,
775
+ "loss": 0.0237,
776
+ "step": 1070
777
+ },
778
+ {
779
+ "epoch": 1.728,
780
+ "grad_norm": 0.002096337964758277,
781
+ "learning_rate": 3.636201991465149e-05,
782
+ "loss": 0.0334,
783
+ "step": 1080
784
+ },
785
+ {
786
+ "epoch": 1.744,
787
+ "grad_norm": 8.111098289489746,
788
+ "learning_rate": 3.618421052631579e-05,
789
+ "loss": 0.1467,
790
+ "step": 1090
791
+ },
792
+ {
793
+ "epoch": 1.76,
794
+ "grad_norm": 0.0023632964584976435,
795
+ "learning_rate": 3.600640113798009e-05,
796
+ "loss": 0.0003,
797
+ "step": 1100
798
+ },
799
+ {
800
+ "epoch": 1.776,
801
+ "grad_norm": 9.071378707885742,
802
+ "learning_rate": 3.5828591749644385e-05,
803
+ "loss": 0.0176,
804
+ "step": 1110
805
+ },
806
+ {
807
+ "epoch": 1.792,
808
+ "grad_norm": 0.08811729401350021,
809
+ "learning_rate": 3.565078236130868e-05,
810
+ "loss": 0.1454,
811
+ "step": 1120
812
+ },
813
+ {
814
+ "epoch": 1.808,
815
+ "grad_norm": 0.0101834237575531,
816
+ "learning_rate": 3.547297297297297e-05,
817
+ "loss": 0.0172,
818
+ "step": 1130
819
+ },
820
+ {
821
+ "epoch": 1.8239999999999998,
822
+ "grad_norm": 0.1676841378211975,
823
+ "learning_rate": 3.529516358463727e-05,
824
+ "loss": 0.049,
825
+ "step": 1140
826
+ },
827
+ {
828
+ "epoch": 1.8399999999999999,
829
+ "grad_norm": 0.043762240558862686,
830
+ "learning_rate": 3.5117354196301564e-05,
831
+ "loss": 0.0432,
832
+ "step": 1150
833
+ },
834
+ {
835
+ "epoch": 1.8559999999999999,
836
+ "grad_norm": 0.011058449745178223,
837
+ "learning_rate": 3.4939544807965863e-05,
838
+ "loss": 0.001,
839
+ "step": 1160
840
+ },
841
+ {
842
+ "epoch": 1.8719999999999999,
843
+ "grad_norm": 0.0018902173032984138,
844
+ "learning_rate": 3.4761735419630156e-05,
845
+ "loss": 0.0399,
846
+ "step": 1170
847
+ },
848
+ {
849
+ "epoch": 1.888,
850
+ "grad_norm": 0.001800389145500958,
851
+ "learning_rate": 3.4583926031294456e-05,
852
+ "loss": 0.0184,
853
+ "step": 1180
854
+ },
855
+ {
856
+ "epoch": 1.904,
857
+ "grad_norm": 0.06505569815635681,
858
+ "learning_rate": 3.440611664295875e-05,
859
+ "loss": 0.0073,
860
+ "step": 1190
861
+ },
862
+ {
863
+ "epoch": 1.92,
864
+ "grad_norm": 0.5011488199234009,
865
+ "learning_rate": 3.422830725462304e-05,
866
+ "loss": 0.0007,
867
+ "step": 1200
868
+ },
869
+ {
870
+ "epoch": 1.936,
871
+ "grad_norm": 0.013254979625344276,
872
+ "learning_rate": 3.405049786628734e-05,
873
+ "loss": 0.0059,
874
+ "step": 1210
875
+ },
876
+ {
877
+ "epoch": 1.952,
878
+ "grad_norm": 0.7582728266716003,
879
+ "learning_rate": 3.3872688477951634e-05,
880
+ "loss": 0.0013,
881
+ "step": 1220
882
+ },
883
+ {
884
+ "epoch": 1.968,
885
+ "grad_norm": 0.009909105487167835,
886
+ "learning_rate": 3.3694879089615934e-05,
887
+ "loss": 0.0002,
888
+ "step": 1230
889
+ },
890
+ {
891
+ "epoch": 1.984,
892
+ "grad_norm": 0.000718823226634413,
893
+ "learning_rate": 3.351706970128023e-05,
894
+ "loss": 0.0047,
895
+ "step": 1240
896
+ },
897
+ {
898
+ "epoch": 2.0,
899
+ "grad_norm": 0.6642739772796631,
900
+ "learning_rate": 3.333926031294453e-05,
901
+ "loss": 0.0015,
902
+ "step": 1250
903
+ },
904
+ {
905
+ "epoch": 2.0,
906
+ "eval_accuracy": 0.9919839679358717,
907
+ "eval_f1_macro": 0.9810672282141238,
908
+ "eval_f1_micro": 0.9919839679358717,
909
+ "eval_f1_weighted": 0.992107547860045,
910
+ "eval_loss": 0.019453825429081917,
911
+ "eval_precision_macro": 0.9692164931816348,
912
+ "eval_precision_micro": 0.9919839679358717,
913
+ "eval_precision_weighted": 0.9925061936994298,
914
+ "eval_recall_macro": 0.9942915690866512,
915
+ "eval_recall_micro": 0.9919839679358717,
916
+ "eval_recall_weighted": 0.9919839679358717,
917
+ "eval_runtime": 10.636,
918
+ "eval_samples_per_second": 46.916,
919
+ "eval_steps_per_second": 3.009,
920
+ "step": 1250
921
+ },
922
+ {
923
+ "epoch": 2.016,
924
+ "grad_norm": 0.005395730957388878,
925
+ "learning_rate": 3.316145092460882e-05,
926
+ "loss": 0.0001,
927
+ "step": 1260
928
+ },
929
+ {
930
+ "epoch": 2.032,
931
+ "grad_norm": 0.0008304046932607889,
932
+ "learning_rate": 3.298364153627312e-05,
933
+ "loss": 0.0001,
934
+ "step": 1270
935
+ },
936
+ {
937
+ "epoch": 2.048,
938
+ "grad_norm": 0.0007791437674313784,
939
+ "learning_rate": 3.280583214793741e-05,
940
+ "loss": 0.0001,
941
+ "step": 1280
942
+ },
943
+ {
944
+ "epoch": 2.064,
945
+ "grad_norm": 0.0008820474613457918,
946
+ "learning_rate": 3.2628022759601705e-05,
947
+ "loss": 0.0001,
948
+ "step": 1290
949
+ },
950
+ {
951
+ "epoch": 2.08,
952
+ "grad_norm": 0.000594152370467782,
953
+ "learning_rate": 3.2450213371266005e-05,
954
+ "loss": 0.0002,
955
+ "step": 1300
956
+ },
957
+ {
958
+ "epoch": 2.096,
959
+ "grad_norm": 0.0006243674433790147,
960
+ "learning_rate": 3.22724039829303e-05,
961
+ "loss": 0.0022,
962
+ "step": 1310
963
+ },
964
+ {
965
+ "epoch": 2.112,
966
+ "grad_norm": 0.0007709413184784353,
967
+ "learning_rate": 3.20945945945946e-05,
968
+ "loss": 0.0154,
969
+ "step": 1320
970
+ },
971
+ {
972
+ "epoch": 2.128,
973
+ "grad_norm": 0.004626740701496601,
974
+ "learning_rate": 3.191678520625889e-05,
975
+ "loss": 0.0012,
976
+ "step": 1330
977
+ },
978
+ {
979
+ "epoch": 2.144,
980
+ "grad_norm": 0.002739989897236228,
981
+ "learning_rate": 3.173897581792319e-05,
982
+ "loss": 0.0004,
983
+ "step": 1340
984
+ },
985
+ {
986
+ "epoch": 2.16,
987
+ "grad_norm": 0.007507917005568743,
988
+ "learning_rate": 3.156116642958748e-05,
989
+ "loss": 0.0001,
990
+ "step": 1350
991
+ },
992
+ {
993
+ "epoch": 2.176,
994
+ "grad_norm": 0.004014975391328335,
995
+ "learning_rate": 3.138335704125178e-05,
996
+ "loss": 0.001,
997
+ "step": 1360
998
+ },
999
+ {
1000
+ "epoch": 2.192,
1001
+ "grad_norm": 0.000455196452094242,
1002
+ "learning_rate": 3.1205547652916076e-05,
1003
+ "loss": 0.0005,
1004
+ "step": 1370
1005
+ },
1006
+ {
1007
+ "epoch": 2.208,
1008
+ "grad_norm": 0.00030664558289572597,
1009
+ "learning_rate": 3.102773826458037e-05,
1010
+ "loss": 0.0265,
1011
+ "step": 1380
1012
+ },
1013
+ {
1014
+ "epoch": 2.224,
1015
+ "grad_norm": 0.00043227567221038043,
1016
+ "learning_rate": 3.084992887624467e-05,
1017
+ "loss": 0.0001,
1018
+ "step": 1390
1019
+ },
1020
+ {
1021
+ "epoch": 2.24,
1022
+ "grad_norm": 0.0481143593788147,
1023
+ "learning_rate": 3.067211948790896e-05,
1024
+ "loss": 0.0023,
1025
+ "step": 1400
1026
+ },
1027
+ {
1028
+ "epoch": 2.2560000000000002,
1029
+ "grad_norm": 0.005968974903225899,
1030
+ "learning_rate": 3.0494310099573257e-05,
1031
+ "loss": 0.0025,
1032
+ "step": 1410
1033
+ },
1034
+ {
1035
+ "epoch": 2.2720000000000002,
1036
+ "grad_norm": 0.21466164290905,
1037
+ "learning_rate": 3.0316500711237557e-05,
1038
+ "loss": 0.0003,
1039
+ "step": 1420
1040
+ },
1041
+ {
1042
+ "epoch": 2.288,
1043
+ "grad_norm": 0.0018357799854129553,
1044
+ "learning_rate": 3.0138691322901853e-05,
1045
+ "loss": 0.0001,
1046
+ "step": 1430
1047
+ },
1048
+ {
1049
+ "epoch": 2.304,
1050
+ "grad_norm": 0.0004517412162385881,
1051
+ "learning_rate": 2.996088193456615e-05,
1052
+ "loss": 0.0001,
1053
+ "step": 1440
1054
+ },
1055
+ {
1056
+ "epoch": 2.32,
1057
+ "grad_norm": 0.005384071730077267,
1058
+ "learning_rate": 2.9783072546230446e-05,
1059
+ "loss": 0.002,
1060
+ "step": 1450
1061
+ },
1062
+ {
1063
+ "epoch": 2.336,
1064
+ "grad_norm": 0.00046941509936004877,
1065
+ "learning_rate": 2.9605263157894735e-05,
1066
+ "loss": 0.0001,
1067
+ "step": 1460
1068
+ },
1069
+ {
1070
+ "epoch": 2.352,
1071
+ "grad_norm": 0.011709867045283318,
1072
+ "learning_rate": 2.9427453769559032e-05,
1073
+ "loss": 0.0001,
1074
+ "step": 1470
1075
+ },
1076
+ {
1077
+ "epoch": 2.368,
1078
+ "grad_norm": 0.03616934269666672,
1079
+ "learning_rate": 2.9249644381223328e-05,
1080
+ "loss": 0.0008,
1081
+ "step": 1480
1082
+ },
1083
+ {
1084
+ "epoch": 2.384,
1085
+ "grad_norm": 0.0006385542219504714,
1086
+ "learning_rate": 2.9071834992887624e-05,
1087
+ "loss": 0.0001,
1088
+ "step": 1490
1089
+ },
1090
+ {
1091
+ "epoch": 2.4,
1092
+ "grad_norm": 1.0423965454101562,
1093
+ "learning_rate": 2.889402560455192e-05,
1094
+ "loss": 0.0021,
1095
+ "step": 1500
1096
+ },
1097
+ {
1098
+ "epoch": 2.416,
1099
+ "grad_norm": 0.21064622700214386,
1100
+ "learning_rate": 2.8716216216216217e-05,
1101
+ "loss": 0.0003,
1102
+ "step": 1510
1103
+ },
1104
+ {
1105
+ "epoch": 2.432,
1106
+ "grad_norm": 0.0016910170670598745,
1107
+ "learning_rate": 2.8538406827880517e-05,
1108
+ "loss": 0.0001,
1109
+ "step": 1520
1110
+ },
1111
+ {
1112
+ "epoch": 2.448,
1113
+ "grad_norm": 0.0004065225657541305,
1114
+ "learning_rate": 2.8360597439544813e-05,
1115
+ "loss": 0.0004,
1116
+ "step": 1530
1117
+ },
1118
+ {
1119
+ "epoch": 2.464,
1120
+ "grad_norm": 0.0006998078897595406,
1121
+ "learning_rate": 2.8182788051209103e-05,
1122
+ "loss": 0.0001,
1123
+ "step": 1540
1124
+ },
1125
+ {
1126
+ "epoch": 2.48,
1127
+ "grad_norm": 0.0057923863641917706,
1128
+ "learning_rate": 2.80049786628734e-05,
1129
+ "loss": 0.0012,
1130
+ "step": 1550
1131
+ },
1132
+ {
1133
+ "epoch": 2.496,
1134
+ "grad_norm": 0.00030601295293308794,
1135
+ "learning_rate": 2.7827169274537695e-05,
1136
+ "loss": 0.0236,
1137
+ "step": 1560
1138
+ },
1139
+ {
1140
+ "epoch": 2.512,
1141
+ "grad_norm": 0.00045617681462317705,
1142
+ "learning_rate": 2.764935988620199e-05,
1143
+ "loss": 0.0001,
1144
+ "step": 1570
1145
+ },
1146
+ {
1147
+ "epoch": 2.528,
1148
+ "grad_norm": 0.00044945545960217714,
1149
+ "learning_rate": 2.7471550497866288e-05,
1150
+ "loss": 0.0001,
1151
+ "step": 1580
1152
+ },
1153
+ {
1154
+ "epoch": 2.544,
1155
+ "grad_norm": 0.0003521046892274171,
1156
+ "learning_rate": 2.7293741109530584e-05,
1157
+ "loss": 0.0263,
1158
+ "step": 1590
1159
+ },
1160
+ {
1161
+ "epoch": 2.56,
1162
+ "grad_norm": 0.008946732617914677,
1163
+ "learning_rate": 2.711593172119488e-05,
1164
+ "loss": 0.0006,
1165
+ "step": 1600
1166
+ },
1167
+ {
1168
+ "epoch": 2.576,
1169
+ "grad_norm": 0.09531024098396301,
1170
+ "learning_rate": 2.6938122332859177e-05,
1171
+ "loss": 0.0003,
1172
+ "step": 1610
1173
+ },
1174
+ {
1175
+ "epoch": 2.592,
1176
+ "grad_norm": 0.0010131685994565487,
1177
+ "learning_rate": 2.6760312944523473e-05,
1178
+ "loss": 0.0001,
1179
+ "step": 1620
1180
+ },
1181
+ {
1182
+ "epoch": 2.608,
1183
+ "grad_norm": 0.0016723967855796218,
1184
+ "learning_rate": 2.6582503556187766e-05,
1185
+ "loss": 0.0018,
1186
+ "step": 1630
1187
+ },
1188
+ {
1189
+ "epoch": 2.624,
1190
+ "grad_norm": 0.008407480083405972,
1191
+ "learning_rate": 2.6404694167852062e-05,
1192
+ "loss": 0.0344,
1193
+ "step": 1640
1194
+ },
1195
+ {
1196
+ "epoch": 2.64,
1197
+ "grad_norm": 0.001671052654273808,
1198
+ "learning_rate": 2.622688477951636e-05,
1199
+ "loss": 0.0005,
1200
+ "step": 1650
1201
+ },
1202
+ {
1203
+ "epoch": 2.656,
1204
+ "grad_norm": 0.0003514339041430503,
1205
+ "learning_rate": 2.6049075391180655e-05,
1206
+ "loss": 0.0001,
1207
+ "step": 1660
1208
+ },
1209
+ {
1210
+ "epoch": 2.672,
1211
+ "grad_norm": 0.00044292688835412264,
1212
+ "learning_rate": 2.587126600284495e-05,
1213
+ "loss": 0.0017,
1214
+ "step": 1670
1215
+ },
1216
+ {
1217
+ "epoch": 2.6879999999999997,
1218
+ "grad_norm": 0.00033179231104440987,
1219
+ "learning_rate": 2.5693456614509247e-05,
1220
+ "loss": 0.0243,
1221
+ "step": 1680
1222
+ },
1223
+ {
1224
+ "epoch": 2.7039999999999997,
1225
+ "grad_norm": 0.0023233199026435614,
1226
+ "learning_rate": 2.5515647226173544e-05,
1227
+ "loss": 0.0001,
1228
+ "step": 1690
1229
+ },
1230
+ {
1231
+ "epoch": 2.7199999999999998,
1232
+ "grad_norm": 0.0012358427047729492,
1233
+ "learning_rate": 2.533783783783784e-05,
1234
+ "loss": 0.0002,
1235
+ "step": 1700
1236
+ },
1237
+ {
1238
+ "epoch": 2.7359999999999998,
1239
+ "grad_norm": 0.00035215960815548897,
1240
+ "learning_rate": 2.5160028449502136e-05,
1241
+ "loss": 0.0235,
1242
+ "step": 1710
1243
+ },
1244
+ {
1245
+ "epoch": 2.752,
1246
+ "grad_norm": 0.24424146115779877,
1247
+ "learning_rate": 2.4982219061166433e-05,
1248
+ "loss": 0.0005,
1249
+ "step": 1720
1250
+ },
1251
+ {
1252
+ "epoch": 2.768,
1253
+ "grad_norm": 2.014695167541504,
1254
+ "learning_rate": 2.480440967283073e-05,
1255
+ "loss": 0.0111,
1256
+ "step": 1730
1257
+ },
1258
+ {
1259
+ "epoch": 2.784,
1260
+ "grad_norm": 0.00039951372309587896,
1261
+ "learning_rate": 2.4626600284495022e-05,
1262
+ "loss": 0.0004,
1263
+ "step": 1740
1264
+ },
1265
+ {
1266
+ "epoch": 2.8,
1267
+ "grad_norm": 0.0016597098438069224,
1268
+ "learning_rate": 2.4448790896159318e-05,
1269
+ "loss": 0.0002,
1270
+ "step": 1750
1271
+ },
1272
+ {
1273
+ "epoch": 2.816,
1274
+ "grad_norm": 0.010242385789752007,
1275
+ "learning_rate": 2.4270981507823614e-05,
1276
+ "loss": 0.0001,
1277
+ "step": 1760
1278
+ },
1279
+ {
1280
+ "epoch": 2.832,
1281
+ "grad_norm": 0.0023806928656995296,
1282
+ "learning_rate": 2.409317211948791e-05,
1283
+ "loss": 0.0001,
1284
+ "step": 1770
1285
+ },
1286
+ {
1287
+ "epoch": 2.848,
1288
+ "grad_norm": 0.01970355026423931,
1289
+ "learning_rate": 2.3915362731152204e-05,
1290
+ "loss": 0.0183,
1291
+ "step": 1780
1292
+ },
1293
+ {
1294
+ "epoch": 2.864,
1295
+ "grad_norm": 0.0002463227428961545,
1296
+ "learning_rate": 2.37375533428165e-05,
1297
+ "loss": 0.0,
1298
+ "step": 1790
1299
+ },
1300
+ {
1301
+ "epoch": 2.88,
1302
+ "grad_norm": 0.000822290952783078,
1303
+ "learning_rate": 2.35597439544808e-05,
1304
+ "loss": 0.0001,
1305
+ "step": 1800
1306
+ },
1307
+ {
1308
+ "epoch": 2.896,
1309
+ "grad_norm": 0.0026505696587264538,
1310
+ "learning_rate": 2.3381934566145096e-05,
1311
+ "loss": 0.0004,
1312
+ "step": 1810
1313
+ },
1314
+ {
1315
+ "epoch": 2.912,
1316
+ "grad_norm": 0.0005039023817516863,
1317
+ "learning_rate": 2.320412517780939e-05,
1318
+ "loss": 0.0001,
1319
+ "step": 1820
1320
+ },
1321
+ {
1322
+ "epoch": 2.928,
1323
+ "grad_norm": 0.009028232656419277,
1324
+ "learning_rate": 2.3026315789473685e-05,
1325
+ "loss": 0.0001,
1326
+ "step": 1830
1327
+ },
1328
+ {
1329
+ "epoch": 2.944,
1330
+ "grad_norm": 0.002705842722207308,
1331
+ "learning_rate": 2.284850640113798e-05,
1332
+ "loss": 0.0003,
1333
+ "step": 1840
1334
+ },
1335
+ {
1336
+ "epoch": 2.96,
1337
+ "grad_norm": 0.00383372837677598,
1338
+ "learning_rate": 2.2670697012802278e-05,
1339
+ "loss": 0.0017,
1340
+ "step": 1850
1341
+ },
1342
+ {
1343
+ "epoch": 2.976,
1344
+ "grad_norm": 0.00024894202942959964,
1345
+ "learning_rate": 2.2492887624466574e-05,
1346
+ "loss": 0.0002,
1347
+ "step": 1860
1348
+ },
1349
+ {
1350
+ "epoch": 2.992,
1351
+ "grad_norm": 0.0009176091407425702,
1352
+ "learning_rate": 2.2315078236130867e-05,
1353
+ "loss": 0.0,
1354
+ "step": 1870
1355
+ },
1356
+ {
1357
+ "epoch": 3.0,
1358
+ "eval_accuracy": 0.9939879759519038,
1359
+ "eval_f1_macro": 0.9910040909369707,
1360
+ "eval_f1_micro": 0.9939879759519038,
1361
+ "eval_f1_weighted": 0.9939809297459037,
1362
+ "eval_loss": 0.014622284099459648,
1363
+ "eval_precision_macro": 0.9959514170040485,
1364
+ "eval_precision_micro": 0.9939879759519038,
1365
+ "eval_precision_weighted": 0.994060996486901,
1366
+ "eval_recall_macro": 0.9862711213517666,
1367
+ "eval_recall_micro": 0.9939879759519038,
1368
+ "eval_recall_weighted": 0.9939879759519038,
1369
+ "eval_runtime": 10.5217,
1370
+ "eval_samples_per_second": 47.426,
1371
+ "eval_steps_per_second": 3.041,
1372
+ "step": 1875
1373
+ }
1374
+ ],
1375
+ "logging_steps": 10,
1376
+ "max_steps": 3125,
1377
+ "num_input_tokens_seen": 0,
1378
+ "num_train_epochs": 5,
1379
+ "save_steps": 500,
1380
+ "stateful_callbacks": {
1381
+ "EarlyStoppingCallback": {
1382
+ "args": {
1383
+ "early_stopping_patience": 5,
1384
+ "early_stopping_threshold": 0.01
1385
+ },
1386
+ "attributes": {
1387
+ "early_stopping_patience_counter": 2
1388
+ }
1389
+ },
1390
+ "TrainerControl": {
1391
+ "args": {
1392
+ "should_epoch_stop": false,
1393
+ "should_evaluate": false,
1394
+ "should_log": false,
1395
+ "should_save": true,
1396
+ "should_training_stop": false
1397
+ },
1398
+ "attributes": {}
1399
+ }
1400
+ },
1401
+ "total_flos": 2446967439360000.0,
1402
+ "train_batch_size": 8,
1403
+ "trial_name": null,
1404
+ "trial_params": null
1405
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ea603352d747c992629b055015f8d059765cdd964379a0825a7adeea2eb9304
3
+ size 5777