chikit2077 committed on
Commit
15e2a51
·
verified ·
1 Parent(s): 966166f

Upload tokenizer

Browse files
Files changed (4) hide show
  1. special_tokens_map.json +7 -0
  2. tokenizer.json +780 -0
  3. tokenizer_config.json +57 -0
  4. vocab.txt +630 -0
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
tokenizer.json ADDED
@@ -0,0 +1,780 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 625,
8
+ "content": "[MASK]",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 626,
17
+ "content": "[CLS]",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 627,
26
+ "content": "[PAD]",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 628,
35
+ "content": "[SEP]",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 629,
44
+ "content": "[UNK]",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ }
51
+ ],
52
+ "normalizer": {
53
+ "type": "BertNormalizer",
54
+ "clean_text": true,
55
+ "handle_chinese_chars": true,
56
+ "strip_accents": null,
57
+ "lowercase": false
58
+ },
59
+ "pre_tokenizer": {
60
+ "type": "BertPreTokenizer"
61
+ },
62
+ "post_processor": {
63
+ "type": "TemplateProcessing",
64
+ "single": [
65
+ {
66
+ "SpecialToken": {
67
+ "id": "[CLS]",
68
+ "type_id": 0
69
+ }
70
+ },
71
+ {
72
+ "Sequence": {
73
+ "id": "A",
74
+ "type_id": 0
75
+ }
76
+ },
77
+ {
78
+ "SpecialToken": {
79
+ "id": "[SEP]",
80
+ "type_id": 0
81
+ }
82
+ }
83
+ ],
84
+ "pair": [
85
+ {
86
+ "SpecialToken": {
87
+ "id": "[CLS]",
88
+ "type_id": 0
89
+ }
90
+ },
91
+ {
92
+ "Sequence": {
93
+ "id": "A",
94
+ "type_id": 0
95
+ }
96
+ },
97
+ {
98
+ "SpecialToken": {
99
+ "id": "[SEP]",
100
+ "type_id": 0
101
+ }
102
+ },
103
+ {
104
+ "Sequence": {
105
+ "id": "B",
106
+ "type_id": 1
107
+ }
108
+ },
109
+ {
110
+ "SpecialToken": {
111
+ "id": "[SEP]",
112
+ "type_id": 1
113
+ }
114
+ }
115
+ ],
116
+ "special_tokens": {
117
+ "[CLS]": {
118
+ "id": "[CLS]",
119
+ "ids": [
120
+ 626
121
+ ],
122
+ "tokens": [
123
+ "[CLS]"
124
+ ]
125
+ },
126
+ "[SEP]": {
127
+ "id": "[SEP]",
128
+ "ids": [
129
+ 628
130
+ ],
131
+ "tokens": [
132
+ "[SEP]"
133
+ ]
134
+ }
135
+ }
136
+ },
137
+ "decoder": {
138
+ "type": "WordPiece",
139
+ "prefix": "##",
140
+ "cleanup": true
141
+ },
142
+ "model": {
143
+ "type": "WordPiece",
144
+ "unk_token": "[UNK]",
145
+ "continuing_subword_prefix": "##",
146
+ "max_input_chars_per_word": 100,
147
+ "vocab": {
148
+ "AAAA": 0,
149
+ "AAAT": 1,
150
+ "AAAC": 2,
151
+ "AAAG": 3,
152
+ "AAAN": 4,
153
+ "AATA": 5,
154
+ "AATT": 6,
155
+ "AATC": 7,
156
+ "AATG": 8,
157
+ "AATN": 9,
158
+ "AACA": 10,
159
+ "AACT": 11,
160
+ "AACC": 12,
161
+ "AACG": 13,
162
+ "AACN": 14,
163
+ "AAGA": 15,
164
+ "AAGT": 16,
165
+ "AAGC": 17,
166
+ "AAGG": 18,
167
+ "AAGN": 19,
168
+ "AANA": 20,
169
+ "AANT": 21,
170
+ "AANC": 22,
171
+ "AANG": 23,
172
+ "AANN": 24,
173
+ "ATAA": 25,
174
+ "ATAT": 26,
175
+ "ATAC": 27,
176
+ "ATAG": 28,
177
+ "ATAN": 29,
178
+ "ATTA": 30,
179
+ "ATTT": 31,
180
+ "ATTC": 32,
181
+ "ATTG": 33,
182
+ "ATTN": 34,
183
+ "ATCA": 35,
184
+ "ATCT": 36,
185
+ "ATCC": 37,
186
+ "ATCG": 38,
187
+ "ATCN": 39,
188
+ "ATGA": 40,
189
+ "ATGT": 41,
190
+ "ATGC": 42,
191
+ "ATGG": 43,
192
+ "ATGN": 44,
193
+ "ATNA": 45,
194
+ "ATNT": 46,
195
+ "ATNC": 47,
196
+ "ATNG": 48,
197
+ "ATNN": 49,
198
+ "ACAA": 50,
199
+ "ACAT": 51,
200
+ "ACAC": 52,
201
+ "ACAG": 53,
202
+ "ACAN": 54,
203
+ "ACTA": 55,
204
+ "ACTT": 56,
205
+ "ACTC": 57,
206
+ "ACTG": 58,
207
+ "ACTN": 59,
208
+ "ACCA": 60,
209
+ "ACCT": 61,
210
+ "ACCC": 62,
211
+ "ACCG": 63,
212
+ "ACCN": 64,
213
+ "ACGA": 65,
214
+ "ACGT": 66,
215
+ "ACGC": 67,
216
+ "ACGG": 68,
217
+ "ACGN": 69,
218
+ "ACNA": 70,
219
+ "ACNT": 71,
220
+ "ACNC": 72,
221
+ "ACNG": 73,
222
+ "ACNN": 74,
223
+ "AGAA": 75,
224
+ "AGAT": 76,
225
+ "AGAC": 77,
226
+ "AGAG": 78,
227
+ "AGAN": 79,
228
+ "AGTA": 80,
229
+ "AGTT": 81,
230
+ "AGTC": 82,
231
+ "AGTG": 83,
232
+ "AGTN": 84,
233
+ "AGCA": 85,
234
+ "AGCT": 86,
235
+ "AGCC": 87,
236
+ "AGCG": 88,
237
+ "AGCN": 89,
238
+ "AGGA": 90,
239
+ "AGGT": 91,
240
+ "AGGC": 92,
241
+ "AGGG": 93,
242
+ "AGGN": 94,
243
+ "AGNA": 95,
244
+ "AGNT": 96,
245
+ "AGNC": 97,
246
+ "AGNG": 98,
247
+ "AGNN": 99,
248
+ "ANAA": 100,
249
+ "ANAT": 101,
250
+ "ANAC": 102,
251
+ "ANAG": 103,
252
+ "ANAN": 104,
253
+ "ANTA": 105,
254
+ "ANTT": 106,
255
+ "ANTC": 107,
256
+ "ANTG": 108,
257
+ "ANTN": 109,
258
+ "ANCA": 110,
259
+ "ANCT": 111,
260
+ "ANCC": 112,
261
+ "ANCG": 113,
262
+ "ANCN": 114,
263
+ "ANGA": 115,
264
+ "ANGT": 116,
265
+ "ANGC": 117,
266
+ "ANGG": 118,
267
+ "ANGN": 119,
268
+ "ANNA": 120,
269
+ "ANNT": 121,
270
+ "ANNC": 122,
271
+ "ANNG": 123,
272
+ "ANNN": 124,
273
+ "TAAA": 125,
274
+ "TAAT": 126,
275
+ "TAAC": 127,
276
+ "TAAG": 128,
277
+ "TAAN": 129,
278
+ "TATA": 130,
279
+ "TATT": 131,
280
+ "TATC": 132,
281
+ "TATG": 133,
282
+ "TATN": 134,
283
+ "TACA": 135,
284
+ "TACT": 136,
285
+ "TACC": 137,
286
+ "TACG": 138,
287
+ "TACN": 139,
288
+ "TAGA": 140,
289
+ "TAGT": 141,
290
+ "TAGC": 142,
291
+ "TAGG": 143,
292
+ "TAGN": 144,
293
+ "TANA": 145,
294
+ "TANT": 146,
295
+ "TANC": 147,
296
+ "TANG": 148,
297
+ "TANN": 149,
298
+ "TTAA": 150,
299
+ "TTAT": 151,
300
+ "TTAC": 152,
301
+ "TTAG": 153,
302
+ "TTAN": 154,
303
+ "TTTA": 155,
304
+ "TTTT": 156,
305
+ "TTTC": 157,
306
+ "TTTG": 158,
307
+ "TTTN": 159,
308
+ "TTCA": 160,
309
+ "TTCT": 161,
310
+ "TTCC": 162,
311
+ "TTCG": 163,
312
+ "TTCN": 164,
313
+ "TTGA": 165,
314
+ "TTGT": 166,
315
+ "TTGC": 167,
316
+ "TTGG": 168,
317
+ "TTGN": 169,
318
+ "TTNA": 170,
319
+ "TTNT": 171,
320
+ "TTNC": 172,
321
+ "TTNG": 173,
322
+ "TTNN": 174,
323
+ "TCAA": 175,
324
+ "TCAT": 176,
325
+ "TCAC": 177,
326
+ "TCAG": 178,
327
+ "TCAN": 179,
328
+ "TCTA": 180,
329
+ "TCTT": 181,
330
+ "TCTC": 182,
331
+ "TCTG": 183,
332
+ "TCTN": 184,
333
+ "TCCA": 185,
334
+ "TCCT": 186,
335
+ "TCCC": 187,
336
+ "TCCG": 188,
337
+ "TCCN": 189,
338
+ "TCGA": 190,
339
+ "TCGT": 191,
340
+ "TCGC": 192,
341
+ "TCGG": 193,
342
+ "TCGN": 194,
343
+ "TCNA": 195,
344
+ "TCNT": 196,
345
+ "TCNC": 197,
346
+ "TCNG": 198,
347
+ "TCNN": 199,
348
+ "TGAA": 200,
349
+ "TGAT": 201,
350
+ "TGAC": 202,
351
+ "TGAG": 203,
352
+ "TGAN": 204,
353
+ "TGTA": 205,
354
+ "TGTT": 206,
355
+ "TGTC": 207,
356
+ "TGTG": 208,
357
+ "TGTN": 209,
358
+ "TGCA": 210,
359
+ "TGCT": 211,
360
+ "TGCC": 212,
361
+ "TGCG": 213,
362
+ "TGCN": 214,
363
+ "TGGA": 215,
364
+ "TGGT": 216,
365
+ "TGGC": 217,
366
+ "TGGG": 218,
367
+ "TGGN": 219,
368
+ "TGNA": 220,
369
+ "TGNT": 221,
370
+ "TGNC": 222,
371
+ "TGNG": 223,
372
+ "TGNN": 224,
373
+ "TNAA": 225,
374
+ "TNAT": 226,
375
+ "TNAC": 227,
376
+ "TNAG": 228,
377
+ "TNAN": 229,
378
+ "TNTA": 230,
379
+ "TNTT": 231,
380
+ "TNTC": 232,
381
+ "TNTG": 233,
382
+ "TNTN": 234,
383
+ "TNCA": 235,
384
+ "TNCT": 236,
385
+ "TNCC": 237,
386
+ "TNCG": 238,
387
+ "TNCN": 239,
388
+ "TNGA": 240,
389
+ "TNGT": 241,
390
+ "TNGC": 242,
391
+ "TNGG": 243,
392
+ "TNGN": 244,
393
+ "TNNA": 245,
394
+ "TNNT": 246,
395
+ "TNNC": 247,
396
+ "TNNG": 248,
397
+ "TNNN": 249,
398
+ "CAAA": 250,
399
+ "CAAT": 251,
400
+ "CAAC": 252,
401
+ "CAAG": 253,
402
+ "CAAN": 254,
403
+ "CATA": 255,
404
+ "CATT": 256,
405
+ "CATC": 257,
406
+ "CATG": 258,
407
+ "CATN": 259,
408
+ "CACA": 260,
409
+ "CACT": 261,
410
+ "CACC": 262,
411
+ "CACG": 263,
412
+ "CACN": 264,
413
+ "CAGA": 265,
414
+ "CAGT": 266,
415
+ "CAGC": 267,
416
+ "CAGG": 268,
417
+ "CAGN": 269,
418
+ "CANA": 270,
419
+ "CANT": 271,
420
+ "CANC": 272,
421
+ "CANG": 273,
422
+ "CANN": 274,
423
+ "CTAA": 275,
424
+ "CTAT": 276,
425
+ "CTAC": 277,
426
+ "CTAG": 278,
427
+ "CTAN": 279,
428
+ "CTTA": 280,
429
+ "CTTT": 281,
430
+ "CTTC": 282,
431
+ "CTTG": 283,
432
+ "CTTN": 284,
433
+ "CTCA": 285,
434
+ "CTCT": 286,
435
+ "CTCC": 287,
436
+ "CTCG": 288,
437
+ "CTCN": 289,
438
+ "CTGA": 290,
439
+ "CTGT": 291,
440
+ "CTGC": 292,
441
+ "CTGG": 293,
442
+ "CTGN": 294,
443
+ "CTNA": 295,
444
+ "CTNT": 296,
445
+ "CTNC": 297,
446
+ "CTNG": 298,
447
+ "CTNN": 299,
448
+ "CCAA": 300,
449
+ "CCAT": 301,
450
+ "CCAC": 302,
451
+ "CCAG": 303,
452
+ "CCAN": 304,
453
+ "CCTA": 305,
454
+ "CCTT": 306,
455
+ "CCTC": 307,
456
+ "CCTG": 308,
457
+ "CCTN": 309,
458
+ "CCCA": 310,
459
+ "CCCT": 311,
460
+ "CCCC": 312,
461
+ "CCCG": 313,
462
+ "CCCN": 314,
463
+ "CCGA": 315,
464
+ "CCGT": 316,
465
+ "CCGC": 317,
466
+ "CCGG": 318,
467
+ "CCGN": 319,
468
+ "CCNA": 320,
469
+ "CCNT": 321,
470
+ "CCNC": 322,
471
+ "CCNG": 323,
472
+ "CCNN": 324,
473
+ "CGAA": 325,
474
+ "CGAT": 326,
475
+ "CGAC": 327,
476
+ "CGAG": 328,
477
+ "CGAN": 329,
478
+ "CGTA": 330,
479
+ "CGTT": 331,
480
+ "CGTC": 332,
481
+ "CGTG": 333,
482
+ "CGTN": 334,
483
+ "CGCA": 335,
484
+ "CGCT": 336,
485
+ "CGCC": 337,
486
+ "CGCG": 338,
487
+ "CGCN": 339,
488
+ "CGGA": 340,
489
+ "CGGT": 341,
490
+ "CGGC": 342,
491
+ "CGGG": 343,
492
+ "CGGN": 344,
493
+ "CGNA": 345,
494
+ "CGNT": 346,
495
+ "CGNC": 347,
496
+ "CGNG": 348,
497
+ "CGNN": 349,
498
+ "CNAA": 350,
499
+ "CNAT": 351,
500
+ "CNAC": 352,
501
+ "CNAG": 353,
502
+ "CNAN": 354,
503
+ "CNTA": 355,
504
+ "CNTT": 356,
505
+ "CNTC": 357,
506
+ "CNTG": 358,
507
+ "CNTN": 359,
508
+ "CNCA": 360,
509
+ "CNCT": 361,
510
+ "CNCC": 362,
511
+ "CNCG": 363,
512
+ "CNCN": 364,
513
+ "CNGA": 365,
514
+ "CNGT": 366,
515
+ "CNGC": 367,
516
+ "CNGG": 368,
517
+ "CNGN": 369,
518
+ "CNNA": 370,
519
+ "CNNT": 371,
520
+ "CNNC": 372,
521
+ "CNNG": 373,
522
+ "CNNN": 374,
523
+ "GAAA": 375,
524
+ "GAAT": 376,
525
+ "GAAC": 377,
526
+ "GAAG": 378,
527
+ "GAAN": 379,
528
+ "GATA": 380,
529
+ "GATT": 381,
530
+ "GATC": 382,
531
+ "GATG": 383,
532
+ "GATN": 384,
533
+ "GACA": 385,
534
+ "GACT": 386,
535
+ "GACC": 387,
536
+ "GACG": 388,
537
+ "GACN": 389,
538
+ "GAGA": 390,
539
+ "GAGT": 391,
540
+ "GAGC": 392,
541
+ "GAGG": 393,
542
+ "GAGN": 394,
543
+ "GANA": 395,
544
+ "GANT": 396,
545
+ "GANC": 397,
546
+ "GANG": 398,
547
+ "GANN": 399,
548
+ "GTAA": 400,
549
+ "GTAT": 401,
550
+ "GTAC": 402,
551
+ "GTAG": 403,
552
+ "GTAN": 404,
553
+ "GTTA": 405,
554
+ "GTTT": 406,
555
+ "GTTC": 407,
556
+ "GTTG": 408,
557
+ "GTTN": 409,
558
+ "GTCA": 410,
559
+ "GTCT": 411,
560
+ "GTCC": 412,
561
+ "GTCG": 413,
562
+ "GTCN": 414,
563
+ "GTGA": 415,
564
+ "GTGT": 416,
565
+ "GTGC": 417,
566
+ "GTGG": 418,
567
+ "GTGN": 419,
568
+ "GTNA": 420,
569
+ "GTNT": 421,
570
+ "GTNC": 422,
571
+ "GTNG": 423,
572
+ "GTNN": 424,
573
+ "GCAA": 425,
574
+ "GCAT": 426,
575
+ "GCAC": 427,
576
+ "GCAG": 428,
577
+ "GCAN": 429,
578
+ "GCTA": 430,
579
+ "GCTT": 431,
580
+ "GCTC": 432,
581
+ "GCTG": 433,
582
+ "GCTN": 434,
583
+ "GCCA": 435,
584
+ "GCCT": 436,
585
+ "GCCC": 437,
586
+ "GCCG": 438,
587
+ "GCCN": 439,
588
+ "GCGA": 440,
589
+ "GCGT": 441,
590
+ "GCGC": 442,
591
+ "GCGG": 443,
592
+ "GCGN": 444,
593
+ "GCNA": 445,
594
+ "GCNT": 446,
595
+ "GCNC": 447,
596
+ "GCNG": 448,
597
+ "GCNN": 449,
598
+ "GGAA": 450,
599
+ "GGAT": 451,
600
+ "GGAC": 452,
601
+ "GGAG": 453,
602
+ "GGAN": 454,
603
+ "GGTA": 455,
604
+ "GGTT": 456,
605
+ "GGTC": 457,
606
+ "GGTG": 458,
607
+ "GGTN": 459,
608
+ "GGCA": 460,
609
+ "GGCT": 461,
610
+ "GGCC": 462,
611
+ "GGCG": 463,
612
+ "GGCN": 464,
613
+ "GGGA": 465,
614
+ "GGGT": 466,
615
+ "GGGC": 467,
616
+ "GGGG": 468,
617
+ "GGGN": 469,
618
+ "GGNA": 470,
619
+ "GGNT": 471,
620
+ "GGNC": 472,
621
+ "GGNG": 473,
622
+ "GGNN": 474,
623
+ "GNAA": 475,
624
+ "GNAT": 476,
625
+ "GNAC": 477,
626
+ "GNAG": 478,
627
+ "GNAN": 479,
628
+ "GNTA": 480,
629
+ "GNTT": 481,
630
+ "GNTC": 482,
631
+ "GNTG": 483,
632
+ "GNTN": 484,
633
+ "GNCA": 485,
634
+ "GNCT": 486,
635
+ "GNCC": 487,
636
+ "GNCG": 488,
637
+ "GNCN": 489,
638
+ "GNGA": 490,
639
+ "GNGT": 491,
640
+ "GNGC": 492,
641
+ "GNGG": 493,
642
+ "GNGN": 494,
643
+ "GNNA": 495,
644
+ "GNNT": 496,
645
+ "GNNC": 497,
646
+ "GNNG": 498,
647
+ "GNNN": 499,
648
+ "NAAA": 500,
649
+ "NAAT": 501,
650
+ "NAAC": 502,
651
+ "NAAG": 503,
652
+ "NAAN": 504,
653
+ "NATA": 505,
654
+ "NATT": 506,
655
+ "NATC": 507,
656
+ "NATG": 508,
657
+ "NATN": 509,
658
+ "NACA": 510,
659
+ "NACT": 511,
660
+ "NACC": 512,
661
+ "NACG": 513,
662
+ "NACN": 514,
663
+ "NAGA": 515,
664
+ "NAGT": 516,
665
+ "NAGC": 517,
666
+ "NAGG": 518,
667
+ "NAGN": 519,
668
+ "NANA": 520,
669
+ "NANT": 521,
670
+ "NANC": 522,
671
+ "NANG": 523,
672
+ "NANN": 524,
673
+ "NTAA": 525,
674
+ "NTAT": 526,
675
+ "NTAC": 527,
676
+ "NTAG": 528,
677
+ "NTAN": 529,
678
+ "NTTA": 530,
679
+ "NTTT": 531,
680
+ "NTTC": 532,
681
+ "NTTG": 533,
682
+ "NTTN": 534,
683
+ "NTCA": 535,
684
+ "NTCT": 536,
685
+ "NTCC": 537,
686
+ "NTCG": 538,
687
+ "NTCN": 539,
688
+ "NTGA": 540,
689
+ "NTGT": 541,
690
+ "NTGC": 542,
691
+ "NTGG": 543,
692
+ "NTGN": 544,
693
+ "NTNA": 545,
694
+ "NTNT": 546,
695
+ "NTNC": 547,
696
+ "NTNG": 548,
697
+ "NTNN": 549,
698
+ "NCAA": 550,
699
+ "NCAT": 551,
700
+ "NCAC": 552,
701
+ "NCAG": 553,
702
+ "NCAN": 554,
703
+ "NCTA": 555,
704
+ "NCTT": 556,
705
+ "NCTC": 557,
706
+ "NCTG": 558,
707
+ "NCTN": 559,
708
+ "NCCA": 560,
709
+ "NCCT": 561,
710
+ "NCCC": 562,
711
+ "NCCG": 563,
712
+ "NCCN": 564,
713
+ "NCGA": 565,
714
+ "NCGT": 566,
715
+ "NCGC": 567,
716
+ "NCGG": 568,
717
+ "NCGN": 569,
718
+ "NCNA": 570,
719
+ "NCNT": 571,
720
+ "NCNC": 572,
721
+ "NCNG": 573,
722
+ "NCNN": 574,
723
+ "NGAA": 575,
724
+ "NGAT": 576,
725
+ "NGAC": 577,
726
+ "NGAG": 578,
727
+ "NGAN": 579,
728
+ "NGTA": 580,
729
+ "NGTT": 581,
730
+ "NGTC": 582,
731
+ "NGTG": 583,
732
+ "NGTN": 584,
733
+ "NGCA": 585,
734
+ "NGCT": 586,
735
+ "NGCC": 587,
736
+ "NGCG": 588,
737
+ "NGCN": 589,
738
+ "NGGA": 590,
739
+ "NGGT": 591,
740
+ "NGGC": 592,
741
+ "NGGG": 593,
742
+ "NGGN": 594,
743
+ "NGNA": 595,
744
+ "NGNT": 596,
745
+ "NGNC": 597,
746
+ "NGNG": 598,
747
+ "NGNN": 599,
748
+ "NNAA": 600,
749
+ "NNAT": 601,
750
+ "NNAC": 602,
751
+ "NNAG": 603,
752
+ "NNAN": 604,
753
+ "NNTA": 605,
754
+ "NNTT": 606,
755
+ "NNTC": 607,
756
+ "NNTG": 608,
757
+ "NNTN": 609,
758
+ "NNCA": 610,
759
+ "NNCT": 611,
760
+ "NNCC": 612,
761
+ "NNCG": 613,
762
+ "NNCN": 614,
763
+ "NNGA": 615,
764
+ "NNGT": 616,
765
+ "NNGC": 617,
766
+ "NNGG": 618,
767
+ "NNGN": 619,
768
+ "NNNA": 620,
769
+ "NNNT": 621,
770
+ "NNNC": 622,
771
+ "NNNG": 623,
772
+ "NNNN": 624,
773
+ "[MASK]": 625,
774
+ "[CLS]": 626,
775
+ "[PAD]": 627,
776
+ "[SEP]": 628,
777
+ "[UNK]": 629
778
+ }
779
+ }
780
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "625": {
4
+ "content": "[MASK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "626": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "627": {
20
+ "content": "[PAD]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "628": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "629": {
36
+ "content": "[UNK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "sep_token": "[SEP]",
53
+ "strip_accents": null,
54
+ "tokenize_chinese_chars": true,
55
+ "tokenizer_class": "BertTokenizer",
56
+ "unk_token": "[UNK]"
57
+ }
vocab.txt ADDED
@@ -0,0 +1,630 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ AAAA
2
+ AAAT
3
+ AAAC
4
+ AAAG
5
+ AAAN
6
+ AATA
7
+ AATT
8
+ AATC
9
+ AATG
10
+ AATN
11
+ AACA
12
+ AACT
13
+ AACC
14
+ AACG
15
+ AACN
16
+ AAGA
17
+ AAGT
18
+ AAGC
19
+ AAGG
20
+ AAGN
21
+ AANA
22
+ AANT
23
+ AANC
24
+ AANG
25
+ AANN
26
+ ATAA
27
+ ATAT
28
+ ATAC
29
+ ATAG
30
+ ATAN
31
+ ATTA
32
+ ATTT
33
+ ATTC
34
+ ATTG
35
+ ATTN
36
+ ATCA
37
+ ATCT
38
+ ATCC
39
+ ATCG
40
+ ATCN
41
+ ATGA
42
+ ATGT
43
+ ATGC
44
+ ATGG
45
+ ATGN
46
+ ATNA
47
+ ATNT
48
+ ATNC
49
+ ATNG
50
+ ATNN
51
+ ACAA
52
+ ACAT
53
+ ACAC
54
+ ACAG
55
+ ACAN
56
+ ACTA
57
+ ACTT
58
+ ACTC
59
+ ACTG
60
+ ACTN
61
+ ACCA
62
+ ACCT
63
+ ACCC
64
+ ACCG
65
+ ACCN
66
+ ACGA
67
+ ACGT
68
+ ACGC
69
+ ACGG
70
+ ACGN
71
+ ACNA
72
+ ACNT
73
+ ACNC
74
+ ACNG
75
+ ACNN
76
+ AGAA
77
+ AGAT
78
+ AGAC
79
+ AGAG
80
+ AGAN
81
+ AGTA
82
+ AGTT
83
+ AGTC
84
+ AGTG
85
+ AGTN
86
+ AGCA
87
+ AGCT
88
+ AGCC
89
+ AGCG
90
+ AGCN
91
+ AGGA
92
+ AGGT
93
+ AGGC
94
+ AGGG
95
+ AGGN
96
+ AGNA
97
+ AGNT
98
+ AGNC
99
+ AGNG
100
+ AGNN
101
+ ANAA
102
+ ANAT
103
+ ANAC
104
+ ANAG
105
+ ANAN
106
+ ANTA
107
+ ANTT
108
+ ANTC
109
+ ANTG
110
+ ANTN
111
+ ANCA
112
+ ANCT
113
+ ANCC
114
+ ANCG
115
+ ANCN
116
+ ANGA
117
+ ANGT
118
+ ANGC
119
+ ANGG
120
+ ANGN
121
+ ANNA
122
+ ANNT
123
+ ANNC
124
+ ANNG
125
+ ANNN
126
+ TAAA
127
+ TAAT
128
+ TAAC
129
+ TAAG
130
+ TAAN
131
+ TATA
132
+ TATT
133
+ TATC
134
+ TATG
135
+ TATN
136
+ TACA
137
+ TACT
138
+ TACC
139
+ TACG
140
+ TACN
141
+ TAGA
142
+ TAGT
143
+ TAGC
144
+ TAGG
145
+ TAGN
146
+ TANA
147
+ TANT
148
+ TANC
149
+ TANG
150
+ TANN
151
+ TTAA
152
+ TTAT
153
+ TTAC
154
+ TTAG
155
+ TTAN
156
+ TTTA
157
+ TTTT
158
+ TTTC
159
+ TTTG
160
+ TTTN
161
+ TTCA
162
+ TTCT
163
+ TTCC
164
+ TTCG
165
+ TTCN
166
+ TTGA
167
+ TTGT
168
+ TTGC
169
+ TTGG
170
+ TTGN
171
+ TTNA
172
+ TTNT
173
+ TTNC
174
+ TTNG
175
+ TTNN
176
+ TCAA
177
+ TCAT
178
+ TCAC
179
+ TCAG
180
+ TCAN
181
+ TCTA
182
+ TCTT
183
+ TCTC
184
+ TCTG
185
+ TCTN
186
+ TCCA
187
+ TCCT
188
+ TCCC
189
+ TCCG
190
+ TCCN
191
+ TCGA
192
+ TCGT
193
+ TCGC
194
+ TCGG
195
+ TCGN
196
+ TCNA
197
+ TCNT
198
+ TCNC
199
+ TCNG
200
+ TCNN
201
+ TGAA
202
+ TGAT
203
+ TGAC
204
+ TGAG
205
+ TGAN
206
+ TGTA
207
+ TGTT
208
+ TGTC
209
+ TGTG
210
+ TGTN
211
+ TGCA
212
+ TGCT
213
+ TGCC
214
+ TGCG
215
+ TGCN
216
+ TGGA
217
+ TGGT
218
+ TGGC
219
+ TGGG
220
+ TGGN
221
+ TGNA
222
+ TGNT
223
+ TGNC
224
+ TGNG
225
+ TGNN
226
+ TNAA
227
+ TNAT
228
+ TNAC
229
+ TNAG
230
+ TNAN
231
+ TNTA
232
+ TNTT
233
+ TNTC
234
+ TNTG
235
+ TNTN
236
+ TNCA
237
+ TNCT
238
+ TNCC
239
+ TNCG
240
+ TNCN
241
+ TNGA
242
+ TNGT
243
+ TNGC
244
+ TNGG
245
+ TNGN
246
+ TNNA
247
+ TNNT
248
+ TNNC
249
+ TNNG
250
+ TNNN
251
+ CAAA
252
+ CAAT
253
+ CAAC
254
+ CAAG
255
+ CAAN
256
+ CATA
257
+ CATT
258
+ CATC
259
+ CATG
260
+ CATN
261
+ CACA
262
+ CACT
263
+ CACC
264
+ CACG
265
+ CACN
266
+ CAGA
267
+ CAGT
268
+ CAGC
269
+ CAGG
270
+ CAGN
271
+ CANA
272
+ CANT
273
+ CANC
274
+ CANG
275
+ CANN
276
+ CTAA
277
+ CTAT
278
+ CTAC
279
+ CTAG
280
+ CTAN
281
+ CTTA
282
+ CTTT
283
+ CTTC
284
+ CTTG
285
+ CTTN
286
+ CTCA
287
+ CTCT
288
+ CTCC
289
+ CTCG
290
+ CTCN
291
+ CTGA
292
+ CTGT
293
+ CTGC
294
+ CTGG
295
+ CTGN
296
+ CTNA
297
+ CTNT
298
+ CTNC
299
+ CTNG
300
+ CTNN
301
+ CCAA
302
+ CCAT
303
+ CCAC
304
+ CCAG
305
+ CCAN
306
+ CCTA
307
+ CCTT
308
+ CCTC
309
+ CCTG
310
+ CCTN
311
+ CCCA
312
+ CCCT
313
+ CCCC
314
+ CCCG
315
+ CCCN
316
+ CCGA
317
+ CCGT
318
+ CCGC
319
+ CCGG
320
+ CCGN
321
+ CCNA
322
+ CCNT
323
+ CCNC
324
+ CCNG
325
+ CCNN
326
+ CGAA
327
+ CGAT
328
+ CGAC
329
+ CGAG
330
+ CGAN
331
+ CGTA
332
+ CGTT
333
+ CGTC
334
+ CGTG
335
+ CGTN
336
+ CGCA
337
+ CGCT
338
+ CGCC
339
+ CGCG
340
+ CGCN
341
+ CGGA
342
+ CGGT
343
+ CGGC
344
+ CGGG
345
+ CGGN
346
+ CGNA
347
+ CGNT
348
+ CGNC
349
+ CGNG
350
+ CGNN
351
+ CNAA
352
+ CNAT
353
+ CNAC
354
+ CNAG
355
+ CNAN
356
+ CNTA
357
+ CNTT
358
+ CNTC
359
+ CNTG
360
+ CNTN
361
+ CNCA
362
+ CNCT
363
+ CNCC
364
+ CNCG
365
+ CNCN
366
+ CNGA
367
+ CNGT
368
+ CNGC
369
+ CNGG
370
+ CNGN
371
+ CNNA
372
+ CNNT
373
+ CNNC
374
+ CNNG
375
+ CNNN
376
+ GAAA
377
+ GAAT
378
+ GAAC
379
+ GAAG
380
+ GAAN
381
+ GATA
382
+ GATT
383
+ GATC
384
+ GATG
385
+ GATN
386
+ GACA
387
+ GACT
388
+ GACC
389
+ GACG
390
+ GACN
391
+ GAGA
392
+ GAGT
393
+ GAGC
394
+ GAGG
395
+ GAGN
396
+ GANA
397
+ GANT
398
+ GANC
399
+ GANG
400
+ GANN
401
+ GTAA
402
+ GTAT
403
+ GTAC
404
+ GTAG
405
+ GTAN
406
+ GTTA
407
+ GTTT
408
+ GTTC
409
+ GTTG
410
+ GTTN
411
+ GTCA
412
+ GTCT
413
+ GTCC
414
+ GTCG
415
+ GTCN
416
+ GTGA
417
+ GTGT
418
+ GTGC
419
+ GTGG
420
+ GTGN
421
+ GTNA
422
+ GTNT
423
+ GTNC
424
+ GTNG
425
+ GTNN
426
+ GCAA
427
+ GCAT
428
+ GCAC
429
+ GCAG
430
+ GCAN
431
+ GCTA
432
+ GCTT
433
+ GCTC
434
+ GCTG
435
+ GCTN
436
+ GCCA
437
+ GCCT
438
+ GCCC
439
+ GCCG
440
+ GCCN
441
+ GCGA
442
+ GCGT
443
+ GCGC
444
+ GCGG
445
+ GCGN
446
+ GCNA
447
+ GCNT
448
+ GCNC
449
+ GCNG
450
+ GCNN
451
+ GGAA
452
+ GGAT
453
+ GGAC
454
+ GGAG
455
+ GGAN
456
+ GGTA
457
+ GGTT
458
+ GGTC
459
+ GGTG
460
+ GGTN
461
+ GGCA
462
+ GGCT
463
+ GGCC
464
+ GGCG
465
+ GGCN
466
+ GGGA
467
+ GGGT
468
+ GGGC
469
+ GGGG
470
+ GGGN
471
+ GGNA
472
+ GGNT
473
+ GGNC
474
+ GGNG
475
+ GGNN
476
+ GNAA
477
+ GNAT
478
+ GNAC
479
+ GNAG
480
+ GNAN
481
+ GNTA
482
+ GNTT
483
+ GNTC
484
+ GNTG
485
+ GNTN
486
+ GNCA
487
+ GNCT
488
+ GNCC
489
+ GNCG
490
+ GNCN
491
+ GNGA
492
+ GNGT
493
+ GNGC
494
+ GNGG
495
+ GNGN
496
+ GNNA
497
+ GNNT
498
+ GNNC
499
+ GNNG
500
+ GNNN
501
+ NAAA
502
+ NAAT
503
+ NAAC
504
+ NAAG
505
+ NAAN
506
+ NATA
507
+ NATT
508
+ NATC
509
+ NATG
510
+ NATN
511
+ NACA
512
+ NACT
513
+ NACC
514
+ NACG
515
+ NACN
516
+ NAGA
517
+ NAGT
518
+ NAGC
519
+ NAGG
520
+ NAGN
521
+ NANA
522
+ NANT
523
+ NANC
524
+ NANG
525
+ NANN
526
+ NTAA
527
+ NTAT
528
+ NTAC
529
+ NTAG
530
+ NTAN
531
+ NTTA
532
+ NTTT
533
+ NTTC
534
+ NTTG
535
+ NTTN
536
+ NTCA
537
+ NTCT
538
+ NTCC
539
+ NTCG
540
+ NTCN
541
+ NTGA
542
+ NTGT
543
+ NTGC
544
+ NTGG
545
+ NTGN
546
+ NTNA
547
+ NTNT
548
+ NTNC
549
+ NTNG
550
+ NTNN
551
+ NCAA
552
+ NCAT
553
+ NCAC
554
+ NCAG
555
+ NCAN
556
+ NCTA
557
+ NCTT
558
+ NCTC
559
+ NCTG
560
+ NCTN
561
+ NCCA
562
+ NCCT
563
+ NCCC
564
+ NCCG
565
+ NCCN
566
+ NCGA
567
+ NCGT
568
+ NCGC
569
+ NCGG
570
+ NCGN
571
+ NCNA
572
+ NCNT
573
+ NCNC
574
+ NCNG
575
+ NCNN
576
+ NGAA
577
+ NGAT
578
+ NGAC
579
+ NGAG
580
+ NGAN
581
+ NGTA
582
+ NGTT
583
+ NGTC
584
+ NGTG
585
+ NGTN
586
+ NGCA
587
+ NGCT
588
+ NGCC
589
+ NGCG
590
+ NGCN
591
+ NGGA
592
+ NGGT
593
+ NGGC
594
+ NGGG
595
+ NGGN
596
+ NGNA
597
+ NGNT
598
+ NGNC
599
+ NGNG
600
+ NGNN
601
+ NNAA
602
+ NNAT
603
+ NNAC
604
+ NNAG
605
+ NNAN
606
+ NNTA
607
+ NNTT
608
+ NNTC
609
+ NNTG
610
+ NNTN
611
+ NNCA
612
+ NNCT
613
+ NNCC
614
+ NNCG
615
+ NNCN
616
+ NNGA
617
+ NNGT
618
+ NNGC
619
+ NNGG
620
+ NNGN
621
+ NNNA
622
+ NNNT
623
+ NNNC
624
+ NNNG
625
+ NNNN
626
+ [MASK]
627
+ [CLS]
628
+ [PAD]
629
+ [SEP]
630
+ [UNK]