explorewithai commited on
Commit
b730cdb
·
verified ·
1 Parent(s): fcfeb1f

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
checkpoint-3576/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "FacebookAI/xlm-roberta-base",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 16,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 250002
39
+ }
checkpoint-3576/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:702c4654277aef1f15073d207ced085dca6d4939dfe9e49b9ae49a4c4242703d
3
+ size 1225621956
checkpoint-3576/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3964c92331c3373e4cc30d81611884c4a460d90f761f8b23c9874885e9ec9bcc
3
+ size 2451401082
checkpoint-3576/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d927a08a1939df0a42e6b6a00e09f54423ac03e9ef75e54d8b79a1e1189a213a
3
+ size 14244
checkpoint-3576/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:79255366b727d152370d7b0ccc283bb95912d01c568edf92caf755fd00ffc501
3
+ size 1064
checkpoint-3576/trainer_state.json ADDED
@@ -0,0 +1,611 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8825053995680345,
3
+ "best_model_checkpoint": "./sentiment_model/checkpoint-3129",
4
+ "epoch": 8.0,
5
+ "eval_steps": 500,
6
+ "global_step": 3576,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.11185682326621924,
13
+ "grad_norm": 18.68804168701172,
14
+ "learning_rate": 5.3691275167785235e-06,
15
+ "loss": 4.4686,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.22371364653243847,
20
+ "grad_norm": 33.30501937866211,
21
+ "learning_rate": 1.0738255033557047e-05,
22
+ "loss": 4.0139,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.33557046979865773,
27
+ "grad_norm": 48.02063751220703,
28
+ "learning_rate": 1.633109619686801e-05,
29
+ "loss": 2.9933,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.44742729306487694,
34
+ "grad_norm": 35.39257049560547,
35
+ "learning_rate": 2.192393736017897e-05,
36
+ "loss": 2.5147,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.5592841163310962,
41
+ "grad_norm": 39.318363189697266,
42
+ "learning_rate": 2.7404921700223713e-05,
43
+ "loss": 2.4221,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.6711409395973155,
48
+ "grad_norm": 50.962833404541016,
49
+ "learning_rate": 3.2997762863534674e-05,
50
+ "loss": 2.3124,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.7829977628635347,
55
+ "grad_norm": 37.32981491088867,
56
+ "learning_rate": 3.859060402684564e-05,
57
+ "loss": 2.4092,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.8948545861297539,
62
+ "grad_norm": 32.42803955078125,
63
+ "learning_rate": 4.4183445190156604e-05,
64
+ "loss": 2.4183,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "eval_accuracy": 0.7844492440604751,
70
+ "eval_loss": 0.5040740370750427,
71
+ "eval_runtime": 19.6714,
72
+ "eval_samples_per_second": 117.683,
73
+ "eval_steps_per_second": 3.711,
74
+ "step": 447
75
+ },
76
+ {
77
+ "epoch": 1.0067114093959733,
78
+ "grad_norm": 20.918420791625977,
79
+ "learning_rate": 4.977628635346757e-05,
80
+ "loss": 2.3532,
81
+ "step": 450
82
+ },
83
+ {
84
+ "epoch": 1.1185682326621924,
85
+ "grad_norm": 28.06084442138672,
86
+ "learning_rate": 4.94034302759135e-05,
87
+ "loss": 2.1244,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 1.2304250559284116,
92
+ "grad_norm": 40.768096923828125,
93
+ "learning_rate": 4.8782003479990054e-05,
94
+ "loss": 2.3004,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 1.342281879194631,
99
+ "grad_norm": 19.545913696289062,
100
+ "learning_rate": 4.816057668406662e-05,
101
+ "loss": 2.2399,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 1.45413870246085,
106
+ "grad_norm": 69.29216766357422,
107
+ "learning_rate": 4.753914988814318e-05,
108
+ "loss": 2.1458,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 1.5659955257270695,
113
+ "grad_norm": 29.00174903869629,
114
+ "learning_rate": 4.691772309221974e-05,
115
+ "loss": 2.0778,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 1.6778523489932886,
120
+ "grad_norm": 28.77162742614746,
121
+ "learning_rate": 4.62962962962963e-05,
122
+ "loss": 2.1317,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 1.7897091722595078,
127
+ "grad_norm": 36.04485321044922,
128
+ "learning_rate": 4.5674869500372856e-05,
129
+ "loss": 2.1345,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 1.901565995525727,
134
+ "grad_norm": 51.31829071044922,
135
+ "learning_rate": 4.505344270444942e-05,
136
+ "loss": 2.2048,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 2.0,
141
+ "eval_accuracy": 0.8138228941684665,
142
+ "eval_loss": 0.43246370553970337,
143
+ "eval_runtime": 19.9941,
144
+ "eval_samples_per_second": 115.784,
145
+ "eval_steps_per_second": 3.651,
146
+ "step": 894
147
+ },
148
+ {
149
+ "epoch": 2.0134228187919465,
150
+ "grad_norm": 19.70203971862793,
151
+ "learning_rate": 4.443201590852597e-05,
152
+ "loss": 2.0482,
153
+ "step": 900
154
+ },
155
+ {
156
+ "epoch": 2.1252796420581657,
157
+ "grad_norm": 44.575721740722656,
158
+ "learning_rate": 4.381058911260254e-05,
159
+ "loss": 1.8304,
160
+ "step": 950
161
+ },
162
+ {
163
+ "epoch": 2.237136465324385,
164
+ "grad_norm": 48.66209411621094,
165
+ "learning_rate": 4.31891623166791e-05,
166
+ "loss": 1.7784,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 2.348993288590604,
171
+ "grad_norm": 19.984678268432617,
172
+ "learning_rate": 4.256773552075566e-05,
173
+ "loss": 1.8457,
174
+ "step": 1050
175
+ },
176
+ {
177
+ "epoch": 2.460850111856823,
178
+ "grad_norm": 64.9332275390625,
179
+ "learning_rate": 4.194630872483222e-05,
180
+ "loss": 1.7747,
181
+ "step": 1100
182
+ },
183
+ {
184
+ "epoch": 2.5727069351230423,
185
+ "grad_norm": 20.591524124145508,
186
+ "learning_rate": 4.1324881928908774e-05,
187
+ "loss": 1.9082,
188
+ "step": 1150
189
+ },
190
+ {
191
+ "epoch": 2.684563758389262,
192
+ "grad_norm": 24.512027740478516,
193
+ "learning_rate": 4.0703455132985336e-05,
194
+ "loss": 1.8399,
195
+ "step": 1200
196
+ },
197
+ {
198
+ "epoch": 2.796420581655481,
199
+ "grad_norm": 18.300504684448242,
200
+ "learning_rate": 4.00820283370619e-05,
201
+ "loss": 1.7937,
202
+ "step": 1250
203
+ },
204
+ {
205
+ "epoch": 2.9082774049217,
206
+ "grad_norm": 40.87727737426758,
207
+ "learning_rate": 3.946060154113845e-05,
208
+ "loss": 1.8174,
209
+ "step": 1300
210
+ },
211
+ {
212
+ "epoch": 3.0,
213
+ "eval_accuracy": 0.8535637149028078,
214
+ "eval_loss": 0.37263187766075134,
215
+ "eval_runtime": 19.6701,
216
+ "eval_samples_per_second": 117.691,
217
+ "eval_steps_per_second": 3.711,
218
+ "step": 1341
219
+ },
220
+ {
221
+ "epoch": 3.0201342281879193,
222
+ "grad_norm": 29.54434585571289,
223
+ "learning_rate": 3.883917474521502e-05,
224
+ "loss": 1.8262,
225
+ "step": 1350
226
+ },
227
+ {
228
+ "epoch": 3.131991051454139,
229
+ "grad_norm": 19.222951889038086,
230
+ "learning_rate": 3.8217747949291576e-05,
231
+ "loss": 1.5486,
232
+ "step": 1400
233
+ },
234
+ {
235
+ "epoch": 3.243847874720358,
236
+ "grad_norm": 36.32904052734375,
237
+ "learning_rate": 3.759632115336814e-05,
238
+ "loss": 1.5273,
239
+ "step": 1450
240
+ },
241
+ {
242
+ "epoch": 3.3557046979865772,
243
+ "grad_norm": 27.05373764038086,
244
+ "learning_rate": 3.697489435744469e-05,
245
+ "loss": 1.5796,
246
+ "step": 1500
247
+ },
248
+ {
249
+ "epoch": 3.4675615212527964,
250
+ "grad_norm": 50.08060836791992,
251
+ "learning_rate": 3.6353467561521254e-05,
252
+ "loss": 1.5104,
253
+ "step": 1550
254
+ },
255
+ {
256
+ "epoch": 3.5794183445190155,
257
+ "grad_norm": 29.98794174194336,
258
+ "learning_rate": 3.5732040765597816e-05,
259
+ "loss": 1.6298,
260
+ "step": 1600
261
+ },
262
+ {
263
+ "epoch": 3.6912751677852347,
264
+ "grad_norm": 33.08045196533203,
265
+ "learning_rate": 3.511061396967437e-05,
266
+ "loss": 1.4769,
267
+ "step": 1650
268
+ },
269
+ {
270
+ "epoch": 3.8031319910514543,
271
+ "grad_norm": 30.475862503051758,
272
+ "learning_rate": 3.448918717375094e-05,
273
+ "loss": 1.547,
274
+ "step": 1700
275
+ },
276
+ {
277
+ "epoch": 3.9149888143176734,
278
+ "grad_norm": 29.52466583251953,
279
+ "learning_rate": 3.3867760377827495e-05,
280
+ "loss": 1.4918,
281
+ "step": 1750
282
+ },
283
+ {
284
+ "epoch": 4.0,
285
+ "eval_accuracy": 0.8501079913606912,
286
+ "eval_loss": 0.38409608602523804,
287
+ "eval_runtime": 19.6128,
288
+ "eval_samples_per_second": 118.035,
289
+ "eval_steps_per_second": 3.722,
290
+ "step": 1788
291
+ },
292
+ {
293
+ "epoch": 4.026845637583893,
294
+ "grad_norm": 27.849456787109375,
295
+ "learning_rate": 3.3246333581904056e-05,
296
+ "loss": 1.4653,
297
+ "step": 1800
298
+ },
299
+ {
300
+ "epoch": 4.138702460850112,
301
+ "grad_norm": 39.9719123840332,
302
+ "learning_rate": 3.262490678598061e-05,
303
+ "loss": 1.2247,
304
+ "step": 1850
305
+ },
306
+ {
307
+ "epoch": 4.250559284116331,
308
+ "grad_norm": 43.72661590576172,
309
+ "learning_rate": 3.200347999005717e-05,
310
+ "loss": 1.2065,
311
+ "step": 1900
312
+ },
313
+ {
314
+ "epoch": 4.3624161073825505,
315
+ "grad_norm": 29.79233169555664,
316
+ "learning_rate": 3.1382053194133735e-05,
317
+ "loss": 1.2585,
318
+ "step": 1950
319
+ },
320
+ {
321
+ "epoch": 4.47427293064877,
322
+ "grad_norm": 30.871849060058594,
323
+ "learning_rate": 3.076062639821029e-05,
324
+ "loss": 1.2602,
325
+ "step": 2000
326
+ },
327
+ {
328
+ "epoch": 4.586129753914989,
329
+ "grad_norm": 30.835567474365234,
330
+ "learning_rate": 3.0139199602286848e-05,
331
+ "loss": 1.2781,
332
+ "step": 2050
333
+ },
334
+ {
335
+ "epoch": 4.697986577181208,
336
+ "grad_norm": 37.51513671875,
337
+ "learning_rate": 2.9517772806363413e-05,
338
+ "loss": 1.228,
339
+ "step": 2100
340
+ },
341
+ {
342
+ "epoch": 4.809843400447427,
343
+ "grad_norm": 35.483665466308594,
344
+ "learning_rate": 2.889634601043997e-05,
345
+ "loss": 1.2876,
346
+ "step": 2150
347
+ },
348
+ {
349
+ "epoch": 4.921700223713646,
350
+ "grad_norm": 21.087987899780273,
351
+ "learning_rate": 2.8274919214516533e-05,
352
+ "loss": 1.2457,
353
+ "step": 2200
354
+ },
355
+ {
356
+ "epoch": 5.0,
357
+ "eval_accuracy": 0.8660907127429806,
358
+ "eval_loss": 0.3854082524776459,
359
+ "eval_runtime": 19.9697,
360
+ "eval_samples_per_second": 115.926,
361
+ "eval_steps_per_second": 3.656,
362
+ "step": 2235
363
+ },
364
+ {
365
+ "epoch": 5.033557046979865,
366
+ "grad_norm": 32.84519577026367,
367
+ "learning_rate": 2.765349241859309e-05,
368
+ "loss": 1.1247,
369
+ "step": 2250
370
+ },
371
+ {
372
+ "epoch": 5.145413870246085,
373
+ "grad_norm": 42.308837890625,
374
+ "learning_rate": 2.703206562266965e-05,
375
+ "loss": 0.9676,
376
+ "step": 2300
377
+ },
378
+ {
379
+ "epoch": 5.257270693512305,
380
+ "grad_norm": 32.68018341064453,
381
+ "learning_rate": 2.6410638826746208e-05,
382
+ "loss": 0.9049,
383
+ "step": 2350
384
+ },
385
+ {
386
+ "epoch": 5.369127516778524,
387
+ "grad_norm": 40.90736770629883,
388
+ "learning_rate": 2.5789212030822766e-05,
389
+ "loss": 0.8973,
390
+ "step": 2400
391
+ },
392
+ {
393
+ "epoch": 5.480984340044743,
394
+ "grad_norm": 42.41788864135742,
395
+ "learning_rate": 2.516778523489933e-05,
396
+ "loss": 0.972,
397
+ "step": 2450
398
+ },
399
+ {
400
+ "epoch": 5.592841163310962,
401
+ "grad_norm": 61.866180419921875,
402
+ "learning_rate": 2.454635843897589e-05,
403
+ "loss": 0.9687,
404
+ "step": 2500
405
+ },
406
+ {
407
+ "epoch": 5.704697986577181,
408
+ "grad_norm": 54.289512634277344,
409
+ "learning_rate": 2.392493164305245e-05,
410
+ "loss": 0.986,
411
+ "step": 2550
412
+ },
413
+ {
414
+ "epoch": 5.8165548098434,
415
+ "grad_norm": 32.10563659667969,
416
+ "learning_rate": 2.330350484712901e-05,
417
+ "loss": 0.981,
418
+ "step": 2600
419
+ },
420
+ {
421
+ "epoch": 5.9284116331096195,
422
+ "grad_norm": 34.02315139770508,
423
+ "learning_rate": 2.2682078051205568e-05,
424
+ "loss": 0.9679,
425
+ "step": 2650
426
+ },
427
+ {
428
+ "epoch": 6.0,
429
+ "eval_accuracy": 0.8678185745140389,
430
+ "eval_loss": 0.3706440329551697,
431
+ "eval_runtime": 19.6479,
432
+ "eval_samples_per_second": 117.824,
433
+ "eval_steps_per_second": 3.715,
434
+ "step": 2682
435
+ },
436
+ {
437
+ "epoch": 6.040268456375839,
438
+ "grad_norm": 36.86701965332031,
439
+ "learning_rate": 2.2060651255282127e-05,
440
+ "loss": 0.9318,
441
+ "step": 2700
442
+ },
443
+ {
444
+ "epoch": 6.152125279642058,
445
+ "grad_norm": 42.711204528808594,
446
+ "learning_rate": 2.145165299527716e-05,
447
+ "loss": 0.703,
448
+ "step": 2750
449
+ },
450
+ {
451
+ "epoch": 6.263982102908278,
452
+ "grad_norm": 38.12577819824219,
453
+ "learning_rate": 2.0830226199353717e-05,
454
+ "loss": 0.764,
455
+ "step": 2800
456
+ },
457
+ {
458
+ "epoch": 6.375838926174497,
459
+ "grad_norm": 32.06464767456055,
460
+ "learning_rate": 2.0208799403430276e-05,
461
+ "loss": 0.739,
462
+ "step": 2850
463
+ },
464
+ {
465
+ "epoch": 6.487695749440716,
466
+ "grad_norm": 36.63414001464844,
467
+ "learning_rate": 1.9587372607506838e-05,
468
+ "loss": 0.7173,
469
+ "step": 2900
470
+ },
471
+ {
472
+ "epoch": 6.599552572706935,
473
+ "grad_norm": 61.28206253051758,
474
+ "learning_rate": 1.8965945811583396e-05,
475
+ "loss": 0.7164,
476
+ "step": 2950
477
+ },
478
+ {
479
+ "epoch": 6.7114093959731544,
480
+ "grad_norm": 27.83030891418457,
481
+ "learning_rate": 1.8344519015659954e-05,
482
+ "loss": 0.7629,
483
+ "step": 3000
484
+ },
485
+ {
486
+ "epoch": 6.823266219239374,
487
+ "grad_norm": 46.308189392089844,
488
+ "learning_rate": 1.7723092219736516e-05,
489
+ "loss": 0.7298,
490
+ "step": 3050
491
+ },
492
+ {
493
+ "epoch": 6.935123042505593,
494
+ "grad_norm": 63.58484649658203,
495
+ "learning_rate": 1.7101665423813078e-05,
496
+ "loss": 0.6876,
497
+ "step": 3100
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "eval_accuracy": 0.8825053995680345,
502
+ "eval_loss": 0.3750421702861786,
503
+ "eval_runtime": 19.8931,
504
+ "eval_samples_per_second": 116.372,
505
+ "eval_steps_per_second": 3.67,
506
+ "step": 3129
507
+ },
508
+ {
509
+ "epoch": 7.046979865771812,
510
+ "grad_norm": 44.923614501953125,
511
+ "learning_rate": 1.6480238627889636e-05,
512
+ "loss": 0.6694,
513
+ "step": 3150
514
+ },
515
+ {
516
+ "epoch": 7.158836689038031,
517
+ "grad_norm": 55.236083984375,
518
+ "learning_rate": 1.5858811831966194e-05,
519
+ "loss": 0.5705,
520
+ "step": 3200
521
+ },
522
+ {
523
+ "epoch": 7.27069351230425,
524
+ "grad_norm": 57.69952392578125,
525
+ "learning_rate": 1.5237385036042756e-05,
526
+ "loss": 0.5369,
527
+ "step": 3250
528
+ },
529
+ {
530
+ "epoch": 7.382550335570469,
531
+ "grad_norm": 48.969078063964844,
532
+ "learning_rate": 1.4615958240119314e-05,
533
+ "loss": 0.5266,
534
+ "step": 3300
535
+ },
536
+ {
537
+ "epoch": 7.494407158836689,
538
+ "grad_norm": 30.38613510131836,
539
+ "learning_rate": 1.3994531444195874e-05,
540
+ "loss": 0.6426,
541
+ "step": 3350
542
+ },
543
+ {
544
+ "epoch": 7.6062639821029085,
545
+ "grad_norm": 32.06161880493164,
546
+ "learning_rate": 1.3385533184190904e-05,
547
+ "loss": 0.5366,
548
+ "step": 3400
549
+ },
550
+ {
551
+ "epoch": 7.718120805369128,
552
+ "grad_norm": 41.513572692871094,
553
+ "learning_rate": 1.2764106388267464e-05,
554
+ "loss": 0.536,
555
+ "step": 3450
556
+ },
557
+ {
558
+ "epoch": 7.829977628635347,
559
+ "grad_norm": 64.65313720703125,
560
+ "learning_rate": 1.2142679592344022e-05,
561
+ "loss": 0.5542,
562
+ "step": 3500
563
+ },
564
+ {
565
+ "epoch": 7.941834451901566,
566
+ "grad_norm": 59.38785934448242,
567
+ "learning_rate": 1.1521252796420582e-05,
568
+ "loss": 0.5386,
569
+ "step": 3550
570
+ },
571
+ {
572
+ "epoch": 8.0,
573
+ "eval_accuracy": 0.8825053995680345,
574
+ "eval_loss": 0.42663174867630005,
575
+ "eval_runtime": 19.6465,
576
+ "eval_samples_per_second": 117.833,
577
+ "eval_steps_per_second": 3.716,
578
+ "step": 3576
579
+ }
580
+ ],
581
+ "logging_steps": 50,
582
+ "max_steps": 4470,
583
+ "num_input_tokens_seen": 0,
584
+ "num_train_epochs": 10,
585
+ "save_steps": 500,
586
+ "stateful_callbacks": {
587
+ "EarlyStoppingCallback": {
588
+ "args": {
589
+ "early_stopping_patience": 3,
590
+ "early_stopping_threshold": 0.0
591
+ },
592
+ "attributes": {
593
+ "early_stopping_patience_counter": 1
594
+ }
595
+ },
596
+ "TrainerControl": {
597
+ "args": {
598
+ "should_epoch_stop": false,
599
+ "should_evaluate": false,
600
+ "should_log": false,
601
+ "should_save": true,
602
+ "should_training_stop": false
603
+ },
604
+ "attributes": {}
605
+ }
606
+ },
607
+ "total_flos": 8.013346575910502e+16,
608
+ "train_batch_size": 16,
609
+ "trial_name": null,
610
+ "trial_params": null
611
+ }
checkpoint-3576/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ca2cef10b94920dcbcc3b9cd54c0948da4a0947897ea88575e16e1dc113b07
3
+ size 5240
checkpoint-4023/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "FacebookAI/xlm-roberta-base",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 16,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 250002
39
+ }
checkpoint-4023/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd4f95342878d7d2f24f7999e41c1ad7dcdb6d73e2964546df43bc570c243e01
3
+ size 1225621956
checkpoint-4023/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:684b50e3317b874bf36f03ddb40a33e52208cabf89cb230aef3925a3b50825e9
3
+ size 2451401082
checkpoint-4023/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cab76c35b24bdd32a516ba6363ea59b907aaca829410a9640913c80c81ad587
3
+ size 14244
checkpoint-4023/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bec07c87c6577fe5c747e9e8d01a6a77367eda016af00dce3acc073b0c1c112
3
+ size 1064
checkpoint-4023/trainer_state.json ADDED
@@ -0,0 +1,683 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8859611231101512,
3
+ "best_model_checkpoint": "./sentiment_model/checkpoint-4023",
4
+ "epoch": 9.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4023,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.11185682326621924,
13
+ "grad_norm": 18.68804168701172,
14
+ "learning_rate": 5.3691275167785235e-06,
15
+ "loss": 4.4686,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.22371364653243847,
20
+ "grad_norm": 33.30501937866211,
21
+ "learning_rate": 1.0738255033557047e-05,
22
+ "loss": 4.0139,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.33557046979865773,
27
+ "grad_norm": 48.02063751220703,
28
+ "learning_rate": 1.633109619686801e-05,
29
+ "loss": 2.9933,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.44742729306487694,
34
+ "grad_norm": 35.39257049560547,
35
+ "learning_rate": 2.192393736017897e-05,
36
+ "loss": 2.5147,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.5592841163310962,
41
+ "grad_norm": 39.318363189697266,
42
+ "learning_rate": 2.7404921700223713e-05,
43
+ "loss": 2.4221,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.6711409395973155,
48
+ "grad_norm": 50.962833404541016,
49
+ "learning_rate": 3.2997762863534674e-05,
50
+ "loss": 2.3124,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.7829977628635347,
55
+ "grad_norm": 37.32981491088867,
56
+ "learning_rate": 3.859060402684564e-05,
57
+ "loss": 2.4092,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.8948545861297539,
62
+ "grad_norm": 32.42803955078125,
63
+ "learning_rate": 4.4183445190156604e-05,
64
+ "loss": 2.4183,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "eval_accuracy": 0.7844492440604751,
70
+ "eval_loss": 0.5040740370750427,
71
+ "eval_runtime": 19.6714,
72
+ "eval_samples_per_second": 117.683,
73
+ "eval_steps_per_second": 3.711,
74
+ "step": 447
75
+ },
76
+ {
77
+ "epoch": 1.0067114093959733,
78
+ "grad_norm": 20.918420791625977,
79
+ "learning_rate": 4.977628635346757e-05,
80
+ "loss": 2.3532,
81
+ "step": 450
82
+ },
83
+ {
84
+ "epoch": 1.1185682326621924,
85
+ "grad_norm": 28.06084442138672,
86
+ "learning_rate": 4.94034302759135e-05,
87
+ "loss": 2.1244,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 1.2304250559284116,
92
+ "grad_norm": 40.768096923828125,
93
+ "learning_rate": 4.8782003479990054e-05,
94
+ "loss": 2.3004,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 1.342281879194631,
99
+ "grad_norm": 19.545913696289062,
100
+ "learning_rate": 4.816057668406662e-05,
101
+ "loss": 2.2399,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 1.45413870246085,
106
+ "grad_norm": 69.29216766357422,
107
+ "learning_rate": 4.753914988814318e-05,
108
+ "loss": 2.1458,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 1.5659955257270695,
113
+ "grad_norm": 29.00174903869629,
114
+ "learning_rate": 4.691772309221974e-05,
115
+ "loss": 2.0778,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 1.6778523489932886,
120
+ "grad_norm": 28.77162742614746,
121
+ "learning_rate": 4.62962962962963e-05,
122
+ "loss": 2.1317,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 1.7897091722595078,
127
+ "grad_norm": 36.04485321044922,
128
+ "learning_rate": 4.5674869500372856e-05,
129
+ "loss": 2.1345,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 1.901565995525727,
134
+ "grad_norm": 51.31829071044922,
135
+ "learning_rate": 4.505344270444942e-05,
136
+ "loss": 2.2048,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 2.0,
141
+ "eval_accuracy": 0.8138228941684665,
142
+ "eval_loss": 0.43246370553970337,
143
+ "eval_runtime": 19.9941,
144
+ "eval_samples_per_second": 115.784,
145
+ "eval_steps_per_second": 3.651,
146
+ "step": 894
147
+ },
148
+ {
149
+ "epoch": 2.0134228187919465,
150
+ "grad_norm": 19.70203971862793,
151
+ "learning_rate": 4.443201590852597e-05,
152
+ "loss": 2.0482,
153
+ "step": 900
154
+ },
155
+ {
156
+ "epoch": 2.1252796420581657,
157
+ "grad_norm": 44.575721740722656,
158
+ "learning_rate": 4.381058911260254e-05,
159
+ "loss": 1.8304,
160
+ "step": 950
161
+ },
162
+ {
163
+ "epoch": 2.237136465324385,
164
+ "grad_norm": 48.66209411621094,
165
+ "learning_rate": 4.31891623166791e-05,
166
+ "loss": 1.7784,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 2.348993288590604,
171
+ "grad_norm": 19.984678268432617,
172
+ "learning_rate": 4.256773552075566e-05,
173
+ "loss": 1.8457,
174
+ "step": 1050
175
+ },
176
+ {
177
+ "epoch": 2.460850111856823,
178
+ "grad_norm": 64.9332275390625,
179
+ "learning_rate": 4.194630872483222e-05,
180
+ "loss": 1.7747,
181
+ "step": 1100
182
+ },
183
+ {
184
+ "epoch": 2.5727069351230423,
185
+ "grad_norm": 20.591524124145508,
186
+ "learning_rate": 4.1324881928908774e-05,
187
+ "loss": 1.9082,
188
+ "step": 1150
189
+ },
190
+ {
191
+ "epoch": 2.684563758389262,
192
+ "grad_norm": 24.512027740478516,
193
+ "learning_rate": 4.0703455132985336e-05,
194
+ "loss": 1.8399,
195
+ "step": 1200
196
+ },
197
+ {
198
+ "epoch": 2.796420581655481,
199
+ "grad_norm": 18.300504684448242,
200
+ "learning_rate": 4.00820283370619e-05,
201
+ "loss": 1.7937,
202
+ "step": 1250
203
+ },
204
+ {
205
+ "epoch": 2.9082774049217,
206
+ "grad_norm": 40.87727737426758,
207
+ "learning_rate": 3.946060154113845e-05,
208
+ "loss": 1.8174,
209
+ "step": 1300
210
+ },
211
+ {
212
+ "epoch": 3.0,
213
+ "eval_accuracy": 0.8535637149028078,
214
+ "eval_loss": 0.37263187766075134,
215
+ "eval_runtime": 19.6701,
216
+ "eval_samples_per_second": 117.691,
217
+ "eval_steps_per_second": 3.711,
218
+ "step": 1341
219
+ },
220
+ {
221
+ "epoch": 3.0201342281879193,
222
+ "grad_norm": 29.54434585571289,
223
+ "learning_rate": 3.883917474521502e-05,
224
+ "loss": 1.8262,
225
+ "step": 1350
226
+ },
227
+ {
228
+ "epoch": 3.131991051454139,
229
+ "grad_norm": 19.222951889038086,
230
+ "learning_rate": 3.8217747949291576e-05,
231
+ "loss": 1.5486,
232
+ "step": 1400
233
+ },
234
+ {
235
+ "epoch": 3.243847874720358,
236
+ "grad_norm": 36.32904052734375,
237
+ "learning_rate": 3.759632115336814e-05,
238
+ "loss": 1.5273,
239
+ "step": 1450
240
+ },
241
+ {
242
+ "epoch": 3.3557046979865772,
243
+ "grad_norm": 27.05373764038086,
244
+ "learning_rate": 3.697489435744469e-05,
245
+ "loss": 1.5796,
246
+ "step": 1500
247
+ },
248
+ {
249
+ "epoch": 3.4675615212527964,
250
+ "grad_norm": 50.08060836791992,
251
+ "learning_rate": 3.6353467561521254e-05,
252
+ "loss": 1.5104,
253
+ "step": 1550
254
+ },
255
+ {
256
+ "epoch": 3.5794183445190155,
257
+ "grad_norm": 29.98794174194336,
258
+ "learning_rate": 3.5732040765597816e-05,
259
+ "loss": 1.6298,
260
+ "step": 1600
261
+ },
262
+ {
263
+ "epoch": 3.6912751677852347,
264
+ "grad_norm": 33.08045196533203,
265
+ "learning_rate": 3.511061396967437e-05,
266
+ "loss": 1.4769,
267
+ "step": 1650
268
+ },
269
+ {
270
+ "epoch": 3.8031319910514543,
271
+ "grad_norm": 30.475862503051758,
272
+ "learning_rate": 3.448918717375094e-05,
273
+ "loss": 1.547,
274
+ "step": 1700
275
+ },
276
+ {
277
+ "epoch": 3.9149888143176734,
278
+ "grad_norm": 29.52466583251953,
279
+ "learning_rate": 3.3867760377827495e-05,
280
+ "loss": 1.4918,
281
+ "step": 1750
282
+ },
283
+ {
284
+ "epoch": 4.0,
285
+ "eval_accuracy": 0.8501079913606912,
286
+ "eval_loss": 0.38409608602523804,
287
+ "eval_runtime": 19.6128,
288
+ "eval_samples_per_second": 118.035,
289
+ "eval_steps_per_second": 3.722,
290
+ "step": 1788
291
+ },
292
+ {
293
+ "epoch": 4.026845637583893,
294
+ "grad_norm": 27.849456787109375,
295
+ "learning_rate": 3.3246333581904056e-05,
296
+ "loss": 1.4653,
297
+ "step": 1800
298
+ },
299
+ {
300
+ "epoch": 4.138702460850112,
301
+ "grad_norm": 39.9719123840332,
302
+ "learning_rate": 3.262490678598061e-05,
303
+ "loss": 1.2247,
304
+ "step": 1850
305
+ },
306
+ {
307
+ "epoch": 4.250559284116331,
308
+ "grad_norm": 43.72661590576172,
309
+ "learning_rate": 3.200347999005717e-05,
310
+ "loss": 1.2065,
311
+ "step": 1900
312
+ },
313
+ {
314
+ "epoch": 4.3624161073825505,
315
+ "grad_norm": 29.79233169555664,
316
+ "learning_rate": 3.1382053194133735e-05,
317
+ "loss": 1.2585,
318
+ "step": 1950
319
+ },
320
+ {
321
+ "epoch": 4.47427293064877,
322
+ "grad_norm": 30.871849060058594,
323
+ "learning_rate": 3.076062639821029e-05,
324
+ "loss": 1.2602,
325
+ "step": 2000
326
+ },
327
+ {
328
+ "epoch": 4.586129753914989,
329
+ "grad_norm": 30.835567474365234,
330
+ "learning_rate": 3.0139199602286848e-05,
331
+ "loss": 1.2781,
332
+ "step": 2050
333
+ },
334
+ {
335
+ "epoch": 4.697986577181208,
336
+ "grad_norm": 37.51513671875,
337
+ "learning_rate": 2.9517772806363413e-05,
338
+ "loss": 1.228,
339
+ "step": 2100
340
+ },
341
+ {
342
+ "epoch": 4.809843400447427,
343
+ "grad_norm": 35.483665466308594,
344
+ "learning_rate": 2.889634601043997e-05,
345
+ "loss": 1.2876,
346
+ "step": 2150
347
+ },
348
+ {
349
+ "epoch": 4.921700223713646,
350
+ "grad_norm": 21.087987899780273,
351
+ "learning_rate": 2.8274919214516533e-05,
352
+ "loss": 1.2457,
353
+ "step": 2200
354
+ },
355
+ {
356
+ "epoch": 5.0,
357
+ "eval_accuracy": 0.8660907127429806,
358
+ "eval_loss": 0.3854082524776459,
359
+ "eval_runtime": 19.9697,
360
+ "eval_samples_per_second": 115.926,
361
+ "eval_steps_per_second": 3.656,
362
+ "step": 2235
363
+ },
364
+ {
365
+ "epoch": 5.033557046979865,
366
+ "grad_norm": 32.84519577026367,
367
+ "learning_rate": 2.765349241859309e-05,
368
+ "loss": 1.1247,
369
+ "step": 2250
370
+ },
371
+ {
372
+ "epoch": 5.145413870246085,
373
+ "grad_norm": 42.308837890625,
374
+ "learning_rate": 2.703206562266965e-05,
375
+ "loss": 0.9676,
376
+ "step": 2300
377
+ },
378
+ {
379
+ "epoch": 5.257270693512305,
380
+ "grad_norm": 32.68018341064453,
381
+ "learning_rate": 2.6410638826746208e-05,
382
+ "loss": 0.9049,
383
+ "step": 2350
384
+ },
385
+ {
386
+ "epoch": 5.369127516778524,
387
+ "grad_norm": 40.90736770629883,
388
+ "learning_rate": 2.5789212030822766e-05,
389
+ "loss": 0.8973,
390
+ "step": 2400
391
+ },
392
+ {
393
+ "epoch": 5.480984340044743,
394
+ "grad_norm": 42.41788864135742,
395
+ "learning_rate": 2.516778523489933e-05,
396
+ "loss": 0.972,
397
+ "step": 2450
398
+ },
399
+ {
400
+ "epoch": 5.592841163310962,
401
+ "grad_norm": 61.866180419921875,
402
+ "learning_rate": 2.454635843897589e-05,
403
+ "loss": 0.9687,
404
+ "step": 2500
405
+ },
406
+ {
407
+ "epoch": 5.704697986577181,
408
+ "grad_norm": 54.289512634277344,
409
+ "learning_rate": 2.392493164305245e-05,
410
+ "loss": 0.986,
411
+ "step": 2550
412
+ },
413
+ {
414
+ "epoch": 5.8165548098434,
415
+ "grad_norm": 32.10563659667969,
416
+ "learning_rate": 2.330350484712901e-05,
417
+ "loss": 0.981,
418
+ "step": 2600
419
+ },
420
+ {
421
+ "epoch": 5.9284116331096195,
422
+ "grad_norm": 34.02315139770508,
423
+ "learning_rate": 2.2682078051205568e-05,
424
+ "loss": 0.9679,
425
+ "step": 2650
426
+ },
427
+ {
428
+ "epoch": 6.0,
429
+ "eval_accuracy": 0.8678185745140389,
430
+ "eval_loss": 0.3706440329551697,
431
+ "eval_runtime": 19.6479,
432
+ "eval_samples_per_second": 117.824,
433
+ "eval_steps_per_second": 3.715,
434
+ "step": 2682
435
+ },
436
+ {
437
+ "epoch": 6.040268456375839,
438
+ "grad_norm": 36.86701965332031,
439
+ "learning_rate": 2.2060651255282127e-05,
440
+ "loss": 0.9318,
441
+ "step": 2700
442
+ },
443
+ {
444
+ "epoch": 6.152125279642058,
445
+ "grad_norm": 42.711204528808594,
446
+ "learning_rate": 2.145165299527716e-05,
447
+ "loss": 0.703,
448
+ "step": 2750
449
+ },
450
+ {
451
+ "epoch": 6.263982102908278,
452
+ "grad_norm": 38.12577819824219,
453
+ "learning_rate": 2.0830226199353717e-05,
454
+ "loss": 0.764,
455
+ "step": 2800
456
+ },
457
+ {
458
+ "epoch": 6.375838926174497,
459
+ "grad_norm": 32.06464767456055,
460
+ "learning_rate": 2.0208799403430276e-05,
461
+ "loss": 0.739,
462
+ "step": 2850
463
+ },
464
+ {
465
+ "epoch": 6.487695749440716,
466
+ "grad_norm": 36.63414001464844,
467
+ "learning_rate": 1.9587372607506838e-05,
468
+ "loss": 0.7173,
469
+ "step": 2900
470
+ },
471
+ {
472
+ "epoch": 6.599552572706935,
473
+ "grad_norm": 61.28206253051758,
474
+ "learning_rate": 1.8965945811583396e-05,
475
+ "loss": 0.7164,
476
+ "step": 2950
477
+ },
478
+ {
479
+ "epoch": 6.7114093959731544,
480
+ "grad_norm": 27.83030891418457,
481
+ "learning_rate": 1.8344519015659954e-05,
482
+ "loss": 0.7629,
483
+ "step": 3000
484
+ },
485
+ {
486
+ "epoch": 6.823266219239374,
487
+ "grad_norm": 46.308189392089844,
488
+ "learning_rate": 1.7723092219736516e-05,
489
+ "loss": 0.7298,
490
+ "step": 3050
491
+ },
492
+ {
493
+ "epoch": 6.935123042505593,
494
+ "grad_norm": 63.58484649658203,
495
+ "learning_rate": 1.7101665423813078e-05,
496
+ "loss": 0.6876,
497
+ "step": 3100
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "eval_accuracy": 0.8825053995680345,
502
+ "eval_loss": 0.3750421702861786,
503
+ "eval_runtime": 19.8931,
504
+ "eval_samples_per_second": 116.372,
505
+ "eval_steps_per_second": 3.67,
506
+ "step": 3129
507
+ },
508
+ {
509
+ "epoch": 7.046979865771812,
510
+ "grad_norm": 44.923614501953125,
511
+ "learning_rate": 1.6480238627889636e-05,
512
+ "loss": 0.6694,
513
+ "step": 3150
514
+ },
515
+ {
516
+ "epoch": 7.158836689038031,
517
+ "grad_norm": 55.236083984375,
518
+ "learning_rate": 1.5858811831966194e-05,
519
+ "loss": 0.5705,
520
+ "step": 3200
521
+ },
522
+ {
523
+ "epoch": 7.27069351230425,
524
+ "grad_norm": 57.69952392578125,
525
+ "learning_rate": 1.5237385036042756e-05,
526
+ "loss": 0.5369,
527
+ "step": 3250
528
+ },
529
+ {
530
+ "epoch": 7.382550335570469,
531
+ "grad_norm": 48.969078063964844,
532
+ "learning_rate": 1.4615958240119314e-05,
533
+ "loss": 0.5266,
534
+ "step": 3300
535
+ },
536
+ {
537
+ "epoch": 7.494407158836689,
538
+ "grad_norm": 30.38613510131836,
539
+ "learning_rate": 1.3994531444195874e-05,
540
+ "loss": 0.6426,
541
+ "step": 3350
542
+ },
543
+ {
544
+ "epoch": 7.6062639821029085,
545
+ "grad_norm": 32.06161880493164,
546
+ "learning_rate": 1.3385533184190904e-05,
547
+ "loss": 0.5366,
548
+ "step": 3400
549
+ },
550
+ {
551
+ "epoch": 7.718120805369128,
552
+ "grad_norm": 41.513572692871094,
553
+ "learning_rate": 1.2764106388267464e-05,
554
+ "loss": 0.536,
555
+ "step": 3450
556
+ },
557
+ {
558
+ "epoch": 7.829977628635347,
559
+ "grad_norm": 64.65313720703125,
560
+ "learning_rate": 1.2142679592344022e-05,
561
+ "loss": 0.5542,
562
+ "step": 3500
563
+ },
564
+ {
565
+ "epoch": 7.941834451901566,
566
+ "grad_norm": 59.38785934448242,
567
+ "learning_rate": 1.1521252796420582e-05,
568
+ "loss": 0.5386,
569
+ "step": 3550
570
+ },
571
+ {
572
+ "epoch": 8.0,
573
+ "eval_accuracy": 0.8825053995680345,
574
+ "eval_loss": 0.42663174867630005,
575
+ "eval_runtime": 19.6465,
576
+ "eval_samples_per_second": 117.833,
577
+ "eval_steps_per_second": 3.716,
578
+ "step": 3576
579
+ },
580
+ {
581
+ "epoch": 8.053691275167786,
582
+ "grad_norm": 46.52591323852539,
583
+ "learning_rate": 1.0899826000497142e-05,
584
+ "loss": 0.4777,
585
+ "step": 3600
586
+ },
587
+ {
588
+ "epoch": 8.165548098434005,
589
+ "grad_norm": 64.87262725830078,
590
+ "learning_rate": 1.0278399204573702e-05,
591
+ "loss": 0.447,
592
+ "step": 3650
593
+ },
594
+ {
595
+ "epoch": 8.277404921700224,
596
+ "grad_norm": 32.57583999633789,
597
+ "learning_rate": 9.65697240865026e-06,
598
+ "loss": 0.4654,
599
+ "step": 3700
600
+ },
601
+ {
602
+ "epoch": 8.389261744966444,
603
+ "grad_norm": 48.06557846069336,
604
+ "learning_rate": 9.035545612726822e-06,
605
+ "loss": 0.4814,
606
+ "step": 3750
607
+ },
608
+ {
609
+ "epoch": 8.501118568232663,
610
+ "grad_norm": 30.20526695251465,
611
+ "learning_rate": 8.41411881680338e-06,
612
+ "loss": 0.382,
613
+ "step": 3800
614
+ },
615
+ {
616
+ "epoch": 8.612975391498882,
617
+ "grad_norm": 24.92857551574707,
618
+ "learning_rate": 7.79269202087994e-06,
619
+ "loss": 0.357,
620
+ "step": 3850
621
+ },
622
+ {
623
+ "epoch": 8.724832214765101,
624
+ "grad_norm": 50.68805694580078,
625
+ "learning_rate": 7.1836937608749695e-06,
626
+ "loss": 0.4397,
627
+ "step": 3900
628
+ },
629
+ {
630
+ "epoch": 8.83668903803132,
631
+ "grad_norm": 6.66060733795166,
632
+ "learning_rate": 6.562266964951529e-06,
633
+ "loss": 0.4038,
634
+ "step": 3950
635
+ },
636
+ {
637
+ "epoch": 8.94854586129754,
638
+ "grad_norm": 11.322134017944336,
639
+ "learning_rate": 5.940840169028089e-06,
640
+ "loss": 0.4638,
641
+ "step": 4000
642
+ },
643
+ {
644
+ "epoch": 9.0,
645
+ "eval_accuracy": 0.8859611231101512,
646
+ "eval_loss": 0.45952075719833374,
647
+ "eval_runtime": 19.6573,
648
+ "eval_samples_per_second": 117.768,
649
+ "eval_steps_per_second": 3.714,
650
+ "step": 4023
651
+ }
652
+ ],
653
+ "logging_steps": 50,
654
+ "max_steps": 4470,
655
+ "num_input_tokens_seen": 0,
656
+ "num_train_epochs": 10,
657
+ "save_steps": 500,
658
+ "stateful_callbacks": {
659
+ "EarlyStoppingCallback": {
660
+ "args": {
661
+ "early_stopping_patience": 3,
662
+ "early_stopping_threshold": 0.0
663
+ },
664
+ "attributes": {
665
+ "early_stopping_patience_counter": 0
666
+ }
667
+ },
668
+ "TrainerControl": {
669
+ "args": {
670
+ "should_epoch_stop": false,
671
+ "should_evaluate": false,
672
+ "should_log": false,
673
+ "should_save": true,
674
+ "should_training_stop": false
675
+ },
676
+ "attributes": {}
677
+ }
678
+ },
679
+ "total_flos": 9.015014897899315e+16,
680
+ "train_batch_size": 16,
681
+ "trial_name": null,
682
+ "trial_params": null
683
+ }
checkpoint-4023/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ca2cef10b94920dcbcc3b9cd54c0948da4a0947897ea88575e16e1dc113b07
3
+ size 5240
checkpoint-4470/config.json ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "FacebookAI/xlm-roberta-base",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 16,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 250002
39
+ }
checkpoint-4470/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:19b783964f48a24eb5dcff631d9f4731ba908394851c23861be1d41350494bb9
3
+ size 1225621956
checkpoint-4470/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db1688d5d8ded2d0a37c5bc8c6d823efcbdca794c510ceb543e7cc05415cff6e
3
+ size 2451401082
checkpoint-4470/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a9299341ef2de240583d3ff7beb83d086ec86f3669a198c44be314f70b55a98
3
+ size 14244
checkpoint-4470/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:28495009a3b93c7ecb81ee55567c92a631d96061f1db4aa11d6cebe54fd5ea35
3
+ size 1064
checkpoint-4470/trainer_state.json ADDED
@@ -0,0 +1,755 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": 0.8859611231101512,
3
+ "best_model_checkpoint": "./sentiment_model/checkpoint-4023",
4
+ "epoch": 10.0,
5
+ "eval_steps": 500,
6
+ "global_step": 4470,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.11185682326621924,
13
+ "grad_norm": 18.68804168701172,
14
+ "learning_rate": 5.3691275167785235e-06,
15
+ "loss": 4.4686,
16
+ "step": 50
17
+ },
18
+ {
19
+ "epoch": 0.22371364653243847,
20
+ "grad_norm": 33.30501937866211,
21
+ "learning_rate": 1.0738255033557047e-05,
22
+ "loss": 4.0139,
23
+ "step": 100
24
+ },
25
+ {
26
+ "epoch": 0.33557046979865773,
27
+ "grad_norm": 48.02063751220703,
28
+ "learning_rate": 1.633109619686801e-05,
29
+ "loss": 2.9933,
30
+ "step": 150
31
+ },
32
+ {
33
+ "epoch": 0.44742729306487694,
34
+ "grad_norm": 35.39257049560547,
35
+ "learning_rate": 2.192393736017897e-05,
36
+ "loss": 2.5147,
37
+ "step": 200
38
+ },
39
+ {
40
+ "epoch": 0.5592841163310962,
41
+ "grad_norm": 39.318363189697266,
42
+ "learning_rate": 2.7404921700223713e-05,
43
+ "loss": 2.4221,
44
+ "step": 250
45
+ },
46
+ {
47
+ "epoch": 0.6711409395973155,
48
+ "grad_norm": 50.962833404541016,
49
+ "learning_rate": 3.2997762863534674e-05,
50
+ "loss": 2.3124,
51
+ "step": 300
52
+ },
53
+ {
54
+ "epoch": 0.7829977628635347,
55
+ "grad_norm": 37.32981491088867,
56
+ "learning_rate": 3.859060402684564e-05,
57
+ "loss": 2.4092,
58
+ "step": 350
59
+ },
60
+ {
61
+ "epoch": 0.8948545861297539,
62
+ "grad_norm": 32.42803955078125,
63
+ "learning_rate": 4.4183445190156604e-05,
64
+ "loss": 2.4183,
65
+ "step": 400
66
+ },
67
+ {
68
+ "epoch": 1.0,
69
+ "eval_accuracy": 0.7844492440604751,
70
+ "eval_loss": 0.5040740370750427,
71
+ "eval_runtime": 19.6714,
72
+ "eval_samples_per_second": 117.683,
73
+ "eval_steps_per_second": 3.711,
74
+ "step": 447
75
+ },
76
+ {
77
+ "epoch": 1.0067114093959733,
78
+ "grad_norm": 20.918420791625977,
79
+ "learning_rate": 4.977628635346757e-05,
80
+ "loss": 2.3532,
81
+ "step": 450
82
+ },
83
+ {
84
+ "epoch": 1.1185682326621924,
85
+ "grad_norm": 28.06084442138672,
86
+ "learning_rate": 4.94034302759135e-05,
87
+ "loss": 2.1244,
88
+ "step": 500
89
+ },
90
+ {
91
+ "epoch": 1.2304250559284116,
92
+ "grad_norm": 40.768096923828125,
93
+ "learning_rate": 4.8782003479990054e-05,
94
+ "loss": 2.3004,
95
+ "step": 550
96
+ },
97
+ {
98
+ "epoch": 1.342281879194631,
99
+ "grad_norm": 19.545913696289062,
100
+ "learning_rate": 4.816057668406662e-05,
101
+ "loss": 2.2399,
102
+ "step": 600
103
+ },
104
+ {
105
+ "epoch": 1.45413870246085,
106
+ "grad_norm": 69.29216766357422,
107
+ "learning_rate": 4.753914988814318e-05,
108
+ "loss": 2.1458,
109
+ "step": 650
110
+ },
111
+ {
112
+ "epoch": 1.5659955257270695,
113
+ "grad_norm": 29.00174903869629,
114
+ "learning_rate": 4.691772309221974e-05,
115
+ "loss": 2.0778,
116
+ "step": 700
117
+ },
118
+ {
119
+ "epoch": 1.6778523489932886,
120
+ "grad_norm": 28.77162742614746,
121
+ "learning_rate": 4.62962962962963e-05,
122
+ "loss": 2.1317,
123
+ "step": 750
124
+ },
125
+ {
126
+ "epoch": 1.7897091722595078,
127
+ "grad_norm": 36.04485321044922,
128
+ "learning_rate": 4.5674869500372856e-05,
129
+ "loss": 2.1345,
130
+ "step": 800
131
+ },
132
+ {
133
+ "epoch": 1.901565995525727,
134
+ "grad_norm": 51.31829071044922,
135
+ "learning_rate": 4.505344270444942e-05,
136
+ "loss": 2.2048,
137
+ "step": 850
138
+ },
139
+ {
140
+ "epoch": 2.0,
141
+ "eval_accuracy": 0.8138228941684665,
142
+ "eval_loss": 0.43246370553970337,
143
+ "eval_runtime": 19.9941,
144
+ "eval_samples_per_second": 115.784,
145
+ "eval_steps_per_second": 3.651,
146
+ "step": 894
147
+ },
148
+ {
149
+ "epoch": 2.0134228187919465,
150
+ "grad_norm": 19.70203971862793,
151
+ "learning_rate": 4.443201590852597e-05,
152
+ "loss": 2.0482,
153
+ "step": 900
154
+ },
155
+ {
156
+ "epoch": 2.1252796420581657,
157
+ "grad_norm": 44.575721740722656,
158
+ "learning_rate": 4.381058911260254e-05,
159
+ "loss": 1.8304,
160
+ "step": 950
161
+ },
162
+ {
163
+ "epoch": 2.237136465324385,
164
+ "grad_norm": 48.66209411621094,
165
+ "learning_rate": 4.31891623166791e-05,
166
+ "loss": 1.7784,
167
+ "step": 1000
168
+ },
169
+ {
170
+ "epoch": 2.348993288590604,
171
+ "grad_norm": 19.984678268432617,
172
+ "learning_rate": 4.256773552075566e-05,
173
+ "loss": 1.8457,
174
+ "step": 1050
175
+ },
176
+ {
177
+ "epoch": 2.460850111856823,
178
+ "grad_norm": 64.9332275390625,
179
+ "learning_rate": 4.194630872483222e-05,
180
+ "loss": 1.7747,
181
+ "step": 1100
182
+ },
183
+ {
184
+ "epoch": 2.5727069351230423,
185
+ "grad_norm": 20.591524124145508,
186
+ "learning_rate": 4.1324881928908774e-05,
187
+ "loss": 1.9082,
188
+ "step": 1150
189
+ },
190
+ {
191
+ "epoch": 2.684563758389262,
192
+ "grad_norm": 24.512027740478516,
193
+ "learning_rate": 4.0703455132985336e-05,
194
+ "loss": 1.8399,
195
+ "step": 1200
196
+ },
197
+ {
198
+ "epoch": 2.796420581655481,
199
+ "grad_norm": 18.300504684448242,
200
+ "learning_rate": 4.00820283370619e-05,
201
+ "loss": 1.7937,
202
+ "step": 1250
203
+ },
204
+ {
205
+ "epoch": 2.9082774049217,
206
+ "grad_norm": 40.87727737426758,
207
+ "learning_rate": 3.946060154113845e-05,
208
+ "loss": 1.8174,
209
+ "step": 1300
210
+ },
211
+ {
212
+ "epoch": 3.0,
213
+ "eval_accuracy": 0.8535637149028078,
214
+ "eval_loss": 0.37263187766075134,
215
+ "eval_runtime": 19.6701,
216
+ "eval_samples_per_second": 117.691,
217
+ "eval_steps_per_second": 3.711,
218
+ "step": 1341
219
+ },
220
+ {
221
+ "epoch": 3.0201342281879193,
222
+ "grad_norm": 29.54434585571289,
223
+ "learning_rate": 3.883917474521502e-05,
224
+ "loss": 1.8262,
225
+ "step": 1350
226
+ },
227
+ {
228
+ "epoch": 3.131991051454139,
229
+ "grad_norm": 19.222951889038086,
230
+ "learning_rate": 3.8217747949291576e-05,
231
+ "loss": 1.5486,
232
+ "step": 1400
233
+ },
234
+ {
235
+ "epoch": 3.243847874720358,
236
+ "grad_norm": 36.32904052734375,
237
+ "learning_rate": 3.759632115336814e-05,
238
+ "loss": 1.5273,
239
+ "step": 1450
240
+ },
241
+ {
242
+ "epoch": 3.3557046979865772,
243
+ "grad_norm": 27.05373764038086,
244
+ "learning_rate": 3.697489435744469e-05,
245
+ "loss": 1.5796,
246
+ "step": 1500
247
+ },
248
+ {
249
+ "epoch": 3.4675615212527964,
250
+ "grad_norm": 50.08060836791992,
251
+ "learning_rate": 3.6353467561521254e-05,
252
+ "loss": 1.5104,
253
+ "step": 1550
254
+ },
255
+ {
256
+ "epoch": 3.5794183445190155,
257
+ "grad_norm": 29.98794174194336,
258
+ "learning_rate": 3.5732040765597816e-05,
259
+ "loss": 1.6298,
260
+ "step": 1600
261
+ },
262
+ {
263
+ "epoch": 3.6912751677852347,
264
+ "grad_norm": 33.08045196533203,
265
+ "learning_rate": 3.511061396967437e-05,
266
+ "loss": 1.4769,
267
+ "step": 1650
268
+ },
269
+ {
270
+ "epoch": 3.8031319910514543,
271
+ "grad_norm": 30.475862503051758,
272
+ "learning_rate": 3.448918717375094e-05,
273
+ "loss": 1.547,
274
+ "step": 1700
275
+ },
276
+ {
277
+ "epoch": 3.9149888143176734,
278
+ "grad_norm": 29.52466583251953,
279
+ "learning_rate": 3.3867760377827495e-05,
280
+ "loss": 1.4918,
281
+ "step": 1750
282
+ },
283
+ {
284
+ "epoch": 4.0,
285
+ "eval_accuracy": 0.8501079913606912,
286
+ "eval_loss": 0.38409608602523804,
287
+ "eval_runtime": 19.6128,
288
+ "eval_samples_per_second": 118.035,
289
+ "eval_steps_per_second": 3.722,
290
+ "step": 1788
291
+ },
292
+ {
293
+ "epoch": 4.026845637583893,
294
+ "grad_norm": 27.849456787109375,
295
+ "learning_rate": 3.3246333581904056e-05,
296
+ "loss": 1.4653,
297
+ "step": 1800
298
+ },
299
+ {
300
+ "epoch": 4.138702460850112,
301
+ "grad_norm": 39.9719123840332,
302
+ "learning_rate": 3.262490678598061e-05,
303
+ "loss": 1.2247,
304
+ "step": 1850
305
+ },
306
+ {
307
+ "epoch": 4.250559284116331,
308
+ "grad_norm": 43.72661590576172,
309
+ "learning_rate": 3.200347999005717e-05,
310
+ "loss": 1.2065,
311
+ "step": 1900
312
+ },
313
+ {
314
+ "epoch": 4.3624161073825505,
315
+ "grad_norm": 29.79233169555664,
316
+ "learning_rate": 3.1382053194133735e-05,
317
+ "loss": 1.2585,
318
+ "step": 1950
319
+ },
320
+ {
321
+ "epoch": 4.47427293064877,
322
+ "grad_norm": 30.871849060058594,
323
+ "learning_rate": 3.076062639821029e-05,
324
+ "loss": 1.2602,
325
+ "step": 2000
326
+ },
327
+ {
328
+ "epoch": 4.586129753914989,
329
+ "grad_norm": 30.835567474365234,
330
+ "learning_rate": 3.0139199602286848e-05,
331
+ "loss": 1.2781,
332
+ "step": 2050
333
+ },
334
+ {
335
+ "epoch": 4.697986577181208,
336
+ "grad_norm": 37.51513671875,
337
+ "learning_rate": 2.9517772806363413e-05,
338
+ "loss": 1.228,
339
+ "step": 2100
340
+ },
341
+ {
342
+ "epoch": 4.809843400447427,
343
+ "grad_norm": 35.483665466308594,
344
+ "learning_rate": 2.889634601043997e-05,
345
+ "loss": 1.2876,
346
+ "step": 2150
347
+ },
348
+ {
349
+ "epoch": 4.921700223713646,
350
+ "grad_norm": 21.087987899780273,
351
+ "learning_rate": 2.8274919214516533e-05,
352
+ "loss": 1.2457,
353
+ "step": 2200
354
+ },
355
+ {
356
+ "epoch": 5.0,
357
+ "eval_accuracy": 0.8660907127429806,
358
+ "eval_loss": 0.3854082524776459,
359
+ "eval_runtime": 19.9697,
360
+ "eval_samples_per_second": 115.926,
361
+ "eval_steps_per_second": 3.656,
362
+ "step": 2235
363
+ },
364
+ {
365
+ "epoch": 5.033557046979865,
366
+ "grad_norm": 32.84519577026367,
367
+ "learning_rate": 2.765349241859309e-05,
368
+ "loss": 1.1247,
369
+ "step": 2250
370
+ },
371
+ {
372
+ "epoch": 5.145413870246085,
373
+ "grad_norm": 42.308837890625,
374
+ "learning_rate": 2.703206562266965e-05,
375
+ "loss": 0.9676,
376
+ "step": 2300
377
+ },
378
+ {
379
+ "epoch": 5.257270693512305,
380
+ "grad_norm": 32.68018341064453,
381
+ "learning_rate": 2.6410638826746208e-05,
382
+ "loss": 0.9049,
383
+ "step": 2350
384
+ },
385
+ {
386
+ "epoch": 5.369127516778524,
387
+ "grad_norm": 40.90736770629883,
388
+ "learning_rate": 2.5789212030822766e-05,
389
+ "loss": 0.8973,
390
+ "step": 2400
391
+ },
392
+ {
393
+ "epoch": 5.480984340044743,
394
+ "grad_norm": 42.41788864135742,
395
+ "learning_rate": 2.516778523489933e-05,
396
+ "loss": 0.972,
397
+ "step": 2450
398
+ },
399
+ {
400
+ "epoch": 5.592841163310962,
401
+ "grad_norm": 61.866180419921875,
402
+ "learning_rate": 2.454635843897589e-05,
403
+ "loss": 0.9687,
404
+ "step": 2500
405
+ },
406
+ {
407
+ "epoch": 5.704697986577181,
408
+ "grad_norm": 54.289512634277344,
409
+ "learning_rate": 2.392493164305245e-05,
410
+ "loss": 0.986,
411
+ "step": 2550
412
+ },
413
+ {
414
+ "epoch": 5.8165548098434,
415
+ "grad_norm": 32.10563659667969,
416
+ "learning_rate": 2.330350484712901e-05,
417
+ "loss": 0.981,
418
+ "step": 2600
419
+ },
420
+ {
421
+ "epoch": 5.9284116331096195,
422
+ "grad_norm": 34.02315139770508,
423
+ "learning_rate": 2.2682078051205568e-05,
424
+ "loss": 0.9679,
425
+ "step": 2650
426
+ },
427
+ {
428
+ "epoch": 6.0,
429
+ "eval_accuracy": 0.8678185745140389,
430
+ "eval_loss": 0.3706440329551697,
431
+ "eval_runtime": 19.6479,
432
+ "eval_samples_per_second": 117.824,
433
+ "eval_steps_per_second": 3.715,
434
+ "step": 2682
435
+ },
436
+ {
437
+ "epoch": 6.040268456375839,
438
+ "grad_norm": 36.86701965332031,
439
+ "learning_rate": 2.2060651255282127e-05,
440
+ "loss": 0.9318,
441
+ "step": 2700
442
+ },
443
+ {
444
+ "epoch": 6.152125279642058,
445
+ "grad_norm": 42.711204528808594,
446
+ "learning_rate": 2.145165299527716e-05,
447
+ "loss": 0.703,
448
+ "step": 2750
449
+ },
450
+ {
451
+ "epoch": 6.263982102908278,
452
+ "grad_norm": 38.12577819824219,
453
+ "learning_rate": 2.0830226199353717e-05,
454
+ "loss": 0.764,
455
+ "step": 2800
456
+ },
457
+ {
458
+ "epoch": 6.375838926174497,
459
+ "grad_norm": 32.06464767456055,
460
+ "learning_rate": 2.0208799403430276e-05,
461
+ "loss": 0.739,
462
+ "step": 2850
463
+ },
464
+ {
465
+ "epoch": 6.487695749440716,
466
+ "grad_norm": 36.63414001464844,
467
+ "learning_rate": 1.9587372607506838e-05,
468
+ "loss": 0.7173,
469
+ "step": 2900
470
+ },
471
+ {
472
+ "epoch": 6.599552572706935,
473
+ "grad_norm": 61.28206253051758,
474
+ "learning_rate": 1.8965945811583396e-05,
475
+ "loss": 0.7164,
476
+ "step": 2950
477
+ },
478
+ {
479
+ "epoch": 6.7114093959731544,
480
+ "grad_norm": 27.83030891418457,
481
+ "learning_rate": 1.8344519015659954e-05,
482
+ "loss": 0.7629,
483
+ "step": 3000
484
+ },
485
+ {
486
+ "epoch": 6.823266219239374,
487
+ "grad_norm": 46.308189392089844,
488
+ "learning_rate": 1.7723092219736516e-05,
489
+ "loss": 0.7298,
490
+ "step": 3050
491
+ },
492
+ {
493
+ "epoch": 6.935123042505593,
494
+ "grad_norm": 63.58484649658203,
495
+ "learning_rate": 1.7101665423813078e-05,
496
+ "loss": 0.6876,
497
+ "step": 3100
498
+ },
499
+ {
500
+ "epoch": 7.0,
501
+ "eval_accuracy": 0.8825053995680345,
502
+ "eval_loss": 0.3750421702861786,
503
+ "eval_runtime": 19.8931,
504
+ "eval_samples_per_second": 116.372,
505
+ "eval_steps_per_second": 3.67,
506
+ "step": 3129
507
+ },
508
+ {
509
+ "epoch": 7.046979865771812,
510
+ "grad_norm": 44.923614501953125,
511
+ "learning_rate": 1.6480238627889636e-05,
512
+ "loss": 0.6694,
513
+ "step": 3150
514
+ },
515
+ {
516
+ "epoch": 7.158836689038031,
517
+ "grad_norm": 55.236083984375,
518
+ "learning_rate": 1.5858811831966194e-05,
519
+ "loss": 0.5705,
520
+ "step": 3200
521
+ },
522
+ {
523
+ "epoch": 7.27069351230425,
524
+ "grad_norm": 57.69952392578125,
525
+ "learning_rate": 1.5237385036042756e-05,
526
+ "loss": 0.5369,
527
+ "step": 3250
528
+ },
529
+ {
530
+ "epoch": 7.382550335570469,
531
+ "grad_norm": 48.969078063964844,
532
+ "learning_rate": 1.4615958240119314e-05,
533
+ "loss": 0.5266,
534
+ "step": 3300
535
+ },
536
+ {
537
+ "epoch": 7.494407158836689,
538
+ "grad_norm": 30.38613510131836,
539
+ "learning_rate": 1.3994531444195874e-05,
540
+ "loss": 0.6426,
541
+ "step": 3350
542
+ },
543
+ {
544
+ "epoch": 7.6062639821029085,
545
+ "grad_norm": 32.06161880493164,
546
+ "learning_rate": 1.3385533184190904e-05,
547
+ "loss": 0.5366,
548
+ "step": 3400
549
+ },
550
+ {
551
+ "epoch": 7.718120805369128,
552
+ "grad_norm": 41.513572692871094,
553
+ "learning_rate": 1.2764106388267464e-05,
554
+ "loss": 0.536,
555
+ "step": 3450
556
+ },
557
+ {
558
+ "epoch": 7.829977628635347,
559
+ "grad_norm": 64.65313720703125,
560
+ "learning_rate": 1.2142679592344022e-05,
561
+ "loss": 0.5542,
562
+ "step": 3500
563
+ },
564
+ {
565
+ "epoch": 7.941834451901566,
566
+ "grad_norm": 59.38785934448242,
567
+ "learning_rate": 1.1521252796420582e-05,
568
+ "loss": 0.5386,
569
+ "step": 3550
570
+ },
571
+ {
572
+ "epoch": 8.0,
573
+ "eval_accuracy": 0.8825053995680345,
574
+ "eval_loss": 0.42663174867630005,
575
+ "eval_runtime": 19.6465,
576
+ "eval_samples_per_second": 117.833,
577
+ "eval_steps_per_second": 3.716,
578
+ "step": 3576
579
+ },
580
+ {
581
+ "epoch": 8.053691275167786,
582
+ "grad_norm": 46.52591323852539,
583
+ "learning_rate": 1.0899826000497142e-05,
584
+ "loss": 0.4777,
585
+ "step": 3600
586
+ },
587
+ {
588
+ "epoch": 8.165548098434005,
589
+ "grad_norm": 64.87262725830078,
590
+ "learning_rate": 1.0278399204573702e-05,
591
+ "loss": 0.447,
592
+ "step": 3650
593
+ },
594
+ {
595
+ "epoch": 8.277404921700224,
596
+ "grad_norm": 32.57583999633789,
597
+ "learning_rate": 9.65697240865026e-06,
598
+ "loss": 0.4654,
599
+ "step": 3700
600
+ },
601
+ {
602
+ "epoch": 8.389261744966444,
603
+ "grad_norm": 48.06557846069336,
604
+ "learning_rate": 9.035545612726822e-06,
605
+ "loss": 0.4814,
606
+ "step": 3750
607
+ },
608
+ {
609
+ "epoch": 8.501118568232663,
610
+ "grad_norm": 30.20526695251465,
611
+ "learning_rate": 8.41411881680338e-06,
612
+ "loss": 0.382,
613
+ "step": 3800
614
+ },
615
+ {
616
+ "epoch": 8.612975391498882,
617
+ "grad_norm": 24.92857551574707,
618
+ "learning_rate": 7.79269202087994e-06,
619
+ "loss": 0.357,
620
+ "step": 3850
621
+ },
622
+ {
623
+ "epoch": 8.724832214765101,
624
+ "grad_norm": 50.68805694580078,
625
+ "learning_rate": 7.1836937608749695e-06,
626
+ "loss": 0.4397,
627
+ "step": 3900
628
+ },
629
+ {
630
+ "epoch": 8.83668903803132,
631
+ "grad_norm": 6.66060733795166,
632
+ "learning_rate": 6.562266964951529e-06,
633
+ "loss": 0.4038,
634
+ "step": 3950
635
+ },
636
+ {
637
+ "epoch": 8.94854586129754,
638
+ "grad_norm": 11.322134017944336,
639
+ "learning_rate": 5.940840169028089e-06,
640
+ "loss": 0.4638,
641
+ "step": 4000
642
+ },
643
+ {
644
+ "epoch": 9.0,
645
+ "eval_accuracy": 0.8859611231101512,
646
+ "eval_loss": 0.45952075719833374,
647
+ "eval_runtime": 19.6573,
648
+ "eval_samples_per_second": 117.768,
649
+ "eval_steps_per_second": 3.714,
650
+ "step": 4023
651
+ },
652
+ {
653
+ "epoch": 9.060402684563758,
654
+ "grad_norm": 19.315446853637695,
655
+ "learning_rate": 5.319413373104649e-06,
656
+ "loss": 0.3911,
657
+ "step": 4050
658
+ },
659
+ {
660
+ "epoch": 9.172259507829978,
661
+ "grad_norm": 21.74611473083496,
662
+ "learning_rate": 4.697986577181209e-06,
663
+ "loss": 0.3029,
664
+ "step": 4100
665
+ },
666
+ {
667
+ "epoch": 9.284116331096197,
668
+ "grad_norm": 7.977785587310791,
669
+ "learning_rate": 4.076559781257768e-06,
670
+ "loss": 0.2811,
671
+ "step": 4150
672
+ },
673
+ {
674
+ "epoch": 9.395973154362416,
675
+ "grad_norm": 70.15277862548828,
676
+ "learning_rate": 3.455132985334328e-06,
677
+ "loss": 0.3847,
678
+ "step": 4200
679
+ },
680
+ {
681
+ "epoch": 9.507829977628635,
682
+ "grad_norm": 81.53914642333984,
683
+ "learning_rate": 2.8337061894108876e-06,
684
+ "loss": 0.3124,
685
+ "step": 4250
686
+ },
687
+ {
688
+ "epoch": 9.619686800894854,
689
+ "grad_norm": 21.751493453979492,
690
+ "learning_rate": 2.2122793934874472e-06,
691
+ "loss": 0.2797,
692
+ "step": 4300
693
+ },
694
+ {
695
+ "epoch": 9.731543624161073,
696
+ "grad_norm": 69.72137451171875,
697
+ "learning_rate": 1.5908525975640068e-06,
698
+ "loss": 0.3571,
699
+ "step": 4350
700
+ },
701
+ {
702
+ "epoch": 9.843400447427292,
703
+ "grad_norm": 162.82540893554688,
704
+ "learning_rate": 9.694258016405668e-07,
705
+ "loss": 0.3267,
706
+ "step": 4400
707
+ },
708
+ {
709
+ "epoch": 9.955257270693512,
710
+ "grad_norm": 89.5910415649414,
711
+ "learning_rate": 3.4799900571712656e-07,
712
+ "loss": 0.378,
713
+ "step": 4450
714
+ },
715
+ {
716
+ "epoch": 10.0,
717
+ "eval_accuracy": 0.885097192224622,
718
+ "eval_loss": 0.5304617285728455,
719
+ "eval_runtime": 20.0633,
720
+ "eval_samples_per_second": 115.385,
721
+ "eval_steps_per_second": 3.638,
722
+ "step": 4470
723
+ }
724
+ ],
725
+ "logging_steps": 50,
726
+ "max_steps": 4470,
727
+ "num_input_tokens_seen": 0,
728
+ "num_train_epochs": 10,
729
+ "save_steps": 500,
730
+ "stateful_callbacks": {
731
+ "EarlyStoppingCallback": {
732
+ "args": {
733
+ "early_stopping_patience": 3,
734
+ "early_stopping_threshold": 0.0
735
+ },
736
+ "attributes": {
737
+ "early_stopping_patience_counter": 1
738
+ }
739
+ },
740
+ "TrainerControl": {
741
+ "args": {
742
+ "should_epoch_stop": false,
743
+ "should_evaluate": false,
744
+ "should_log": false,
745
+ "should_save": true,
746
+ "should_training_stop": true
747
+ },
748
+ "attributes": {}
749
+ }
750
+ },
751
+ "total_flos": 1.0016683219888128e+17,
752
+ "train_batch_size": 16,
753
+ "trial_name": null,
754
+ "trial_params": null
755
+ }
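The trainer_state.json above logs one training entry every 50 optimizer steps plus one evaluation entry per epoch. A minimal sketch for pulling the per-epoch eval accuracy back out of it; the local path is an assumption about where the checkpoint folder was downloaded:

    import json

    # Hypothetical local path; adjust to wherever this checkpoint was downloaded.
    with open("checkpoint-4470/trainer_state.json", "r", encoding="utf-8") as f:
        state = json.load(f)

    # Per-epoch evaluation entries are the log_history items carrying "eval_accuracy".
    evals = [e for e in state["log_history"] if "eval_accuracy" in e]
    for e in evals:
        print(f'epoch {e["epoch"]:4.1f}  accuracy={e["eval_accuracy"]:.4f}  eval_loss={e["eval_loss"]:.4f}')

    best = max(evals, key=lambda e: e["eval_accuracy"])
    print("best eval_accuracy at epoch", best["epoch"], "(step", str(best["step"]) + ")")

In the log above the best eval_accuracy (~0.886) is reached at epoch 9.0 (step 4023); epoch 10 does not improve on it, which is why the EarlyStoppingCallback patience counter stands at 1 when training stops at max_steps 4470.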
checkpoint-4470/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b5ca2cef10b94920dcbcc3b9cd54c0948da4a0947897ea88575e16e1dc113b07
3
+ size 5240
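Every binary artifact in this commit is checked in as a Git LFS pointer like the three lines above (a version URL, a sha256 oid, and a byte size); the real blob lives in LFS storage. A minimal sketch, assuming a clone where the pointers have not yet been smudged into real files, that reads those fields back:

    def parse_lfs_pointer(path: str) -> dict:
        """Parse a Git LFS pointer file (version / oid / size lines) into a dict."""
        fields = {}
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                if line.strip():
                    key, _, value = line.strip().partition(" ")
                    fields[key] = value
        return fields

    # Hypothetical usage against the pointer shown above.
    ptr = parse_lfs_pointer("checkpoint-4470/training_args.bin")
    print(ptr["oid"])   # "sha256:b5ca2c..." per the pointer file
    print(ptr["size"])  # "5240"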
config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "_name_or_path": "FacebookAI/xlm-roberta-base",
3
+ "architectures": [
4
+ "XLMRobertaForSequenceClassification"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "bos_token_id": 0,
8
+ "classifier_dropout": null,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.1,
12
+ "hidden_size": 768,
13
+ "id2label": {
14
+ "0": "LABEL_0",
15
+ "1": "LABEL_1",
16
+ "2": "LABEL_2"
17
+ },
18
+ "initializer_range": 0.02,
19
+ "intermediate_size": 3072,
20
+ "label2id": {
21
+ "LABEL_0": 0,
22
+ "LABEL_1": 1,
23
+ "LABEL_2": 2
24
+ },
25
+ "layer_norm_eps": 1e-05,
26
+ "max_position_embeddings": 514,
27
+ "model_type": "xlm-roberta",
28
+ "num_attention_heads": 12,
29
+ "num_hidden_layers": 16,
30
+ "output_past": true,
31
+ "pad_token_id": 1,
32
+ "position_embedding_type": "absolute",
33
+ "problem_type": "single_label_classification",
34
+ "torch_dtype": "float32",
35
+ "transformers_version": "4.47.1",
36
+ "type_vocab_size": 1,
37
+ "use_cache": true,
38
+ "vocab_size": 250002
39
+ }
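config.json at the repository root declares an XLMRobertaForSequenceClassification checkpoint with three generic labels (LABEL_0–LABEL_2). A minimal inference sketch, assuming the transformers library is installed; the repo path below is a placeholder for a local download or Hub id of this repository:

    import torch
    from transformers import AutoConfig, AutoModelForSequenceClassification, AutoTokenizer

    repo = "path/or/repo-id"  # placeholder: local folder or Hub id of this repository

    config = AutoConfig.from_pretrained(repo)
    print(config.num_hidden_layers, config.id2label)  # as declared in config.json

    tokenizer = AutoTokenizer.from_pretrained(repo)
    model = AutoModelForSequenceClassification.from_pretrained(repo)
    model.eval()

    inputs = tokenizer("Example text to classify", return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    print(config.id2label[int(logits.argmax(dim=-1))])  # LABEL_0 / LABEL_1 / LABEL_2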
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd4f95342878d7d2f24f7999e41c1ad7dcdb6d73e2964546df43bc570c243e01
3
+ size 1225621956
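model.safetensors is again an LFS pointer, here to roughly 1.2 GB of float32 weights. Once the actual file has been fetched (for example via `git lfs pull`), the tensors can be inspected directly with the safetensors library; a sketch, where the "classifier" prefix filter assumes the usual XLMRobertaForSequenceClassification parameter naming:

    from safetensors.torch import load_file

    # Hypothetical local path to the real weights file.
    state_dict = load_file("model.safetensors")
    print(f"{len(state_dict)} tensors")

    # List the sequence-classification head parameters sitting on top of the encoder.
    for name, tensor in state_dict.items():
        if name.startswith("classifier"):
            print(name, tuple(tensor.shape), tensor.dtype)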
sentencepiece.bpe.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc8146abe2a0488e9e2a0c56de7952f7c11ab059eca145a0a727afce0db2865
3
+ size 5069051
special_tokens_map.json ADDED
@@ -0,0 +1,15 @@
1
+ {
2
+ "bos_token": "<s>",
3
+ "cls_token": "<s>",
4
+ "eos_token": "</s>",
5
+ "mask_token": {
6
+ "content": "<mask>",
7
+ "lstrip": true,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false
11
+ },
12
+ "pad_token": "<pad>",
13
+ "sep_token": "</s>",
14
+ "unk_token": "<unk>"
15
+ }
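special_tokens_map.json maps the tokenizer roles to the standard XLM-R sentencepiece specials; note that <mask> is the only entry stored as an object, because it needs lstrip set to true. A small sketch that flattens the file back to role → token pairs; the local path is an assumption:

    import json

    # Hypothetical local path to the file shown above.
    with open("special_tokens_map.json", "r", encoding="utf-8") as f:
        specials = json.load(f)

    # Only <mask> carries per-token flags; the remaining entries are plain strings.
    for role, token in specials.items():
        content = token["content"] if isinstance(token, dict) else token
        print(f"{role:12s} -> {content}")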
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ffb37461c391f096759f4a9bbbc329da0f36952f88bab061fcf84940c022e98
3
+ size 17082999
tokenizer_config.json ADDED
@@ -0,0 +1,55 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<s>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "</s>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "250001": {
36
+ "content": "<mask>",
37
+ "lstrip": true,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "bos_token": "<s>",
45
+ "clean_up_tokenization_spaces": false,
46
+ "cls_token": "<s>",
47
+ "eos_token": "</s>",
48
+ "extra_special_tokens": {},
49
+ "mask_token": "<mask>",
50
+ "model_max_length": 512,
51
+ "pad_token": "<pad>",
52
+ "sep_token": "</s>",
53
+ "tokenizer_class": "XLMRobertaTokenizer",
54
+ "unk_token": "<unk>"
55
+ }
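Finally, tokenizer_config.json registers the same five special tokens under XLMRobertaTokenizer and caps inputs at model_max_length 512. A short encoding sketch under the same placeholder-repo assumption as above:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("path/or/repo-id")  # placeholder for this repository

    enc = tok(
        "An example sentence to classify.",
        truncation=True,                # honours model_max_length = 512
        max_length=tok.model_max_length,
        return_tensors="pt",
    )
    # Sequences come back wrapped as <s> ... </s>, i.e. ids 0 and 2 from the config.
    print(enc["input_ids"].shape)
    print(tok.convert_ids_to_tokens(enc["input_ids"][0].tolist()))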