CLAMP-4mer-500bp-pretrain / tokenizer.json
chikit2077's picture
Upload tokenizer
15e2a51 verified
raw
history blame
14.5 kB
{
"version": "1.0",
"truncation": null,
"padding": null,
"added_tokens": [
{
"id": 625,
"content": "[MASK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 626,
"content": "[CLS]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 627,
"content": "[PAD]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 628,
"content": "[SEP]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
},
{
"id": 629,
"content": "[UNK]",
"single_word": false,
"lstrip": false,
"rstrip": false,
"normalized": false,
"special": true
}
],
"normalizer": {
"type": "BertNormalizer",
"clean_text": true,
"handle_chinese_chars": true,
"strip_accents": null,
"lowercase": false
},
"pre_tokenizer": {
"type": "BertPreTokenizer"
},
"post_processor": {
"type": "TemplateProcessing",
"single": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
}
],
"pair": [
{
"SpecialToken": {
"id": "[CLS]",
"type_id": 0
}
},
{
"Sequence": {
"id": "A",
"type_id": 0
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 0
}
},
{
"Sequence": {
"id": "B",
"type_id": 1
}
},
{
"SpecialToken": {
"id": "[SEP]",
"type_id": 1
}
}
],
"special_tokens": {
"[CLS]": {
"id": "[CLS]",
"ids": [
626
],
"tokens": [
"[CLS]"
]
},
"[SEP]": {
"id": "[SEP]",
"ids": [
628
],
"tokens": [
"[SEP]"
]
}
}
},
"decoder": {
"type": "WordPiece",
"prefix": "##",
"cleanup": true
},
"model": {
"type": "WordPiece",
"unk_token": "[UNK]",
"continuing_subword_prefix": "##",
"max_input_chars_per_word": 100,
"vocab": {
"AAAA": 0,
"AAAT": 1,
"AAAC": 2,
"AAAG": 3,
"AAAN": 4,
"AATA": 5,
"AATT": 6,
"AATC": 7,
"AATG": 8,
"AATN": 9,
"AACA": 10,
"AACT": 11,
"AACC": 12,
"AACG": 13,
"AACN": 14,
"AAGA": 15,
"AAGT": 16,
"AAGC": 17,
"AAGG": 18,
"AAGN": 19,
"AANA": 20,
"AANT": 21,
"AANC": 22,
"AANG": 23,
"AANN": 24,
"ATAA": 25,
"ATAT": 26,
"ATAC": 27,
"ATAG": 28,
"ATAN": 29,
"ATTA": 30,
"ATTT": 31,
"ATTC": 32,
"ATTG": 33,
"ATTN": 34,
"ATCA": 35,
"ATCT": 36,
"ATCC": 37,
"ATCG": 38,
"ATCN": 39,
"ATGA": 40,
"ATGT": 41,
"ATGC": 42,
"ATGG": 43,
"ATGN": 44,
"ATNA": 45,
"ATNT": 46,
"ATNC": 47,
"ATNG": 48,
"ATNN": 49,
"ACAA": 50,
"ACAT": 51,
"ACAC": 52,
"ACAG": 53,
"ACAN": 54,
"ACTA": 55,
"ACTT": 56,
"ACTC": 57,
"ACTG": 58,
"ACTN": 59,
"ACCA": 60,
"ACCT": 61,
"ACCC": 62,
"ACCG": 63,
"ACCN": 64,
"ACGA": 65,
"ACGT": 66,
"ACGC": 67,
"ACGG": 68,
"ACGN": 69,
"ACNA": 70,
"ACNT": 71,
"ACNC": 72,
"ACNG": 73,
"ACNN": 74,
"AGAA": 75,
"AGAT": 76,
"AGAC": 77,
"AGAG": 78,
"AGAN": 79,
"AGTA": 80,
"AGTT": 81,
"AGTC": 82,
"AGTG": 83,
"AGTN": 84,
"AGCA": 85,
"AGCT": 86,
"AGCC": 87,
"AGCG": 88,
"AGCN": 89,
"AGGA": 90,
"AGGT": 91,
"AGGC": 92,
"AGGG": 93,
"AGGN": 94,
"AGNA": 95,
"AGNT": 96,
"AGNC": 97,
"AGNG": 98,
"AGNN": 99,
"ANAA": 100,
"ANAT": 101,
"ANAC": 102,
"ANAG": 103,
"ANAN": 104,
"ANTA": 105,
"ANTT": 106,
"ANTC": 107,
"ANTG": 108,
"ANTN": 109,
"ANCA": 110,
"ANCT": 111,
"ANCC": 112,
"ANCG": 113,
"ANCN": 114,
"ANGA": 115,
"ANGT": 116,
"ANGC": 117,
"ANGG": 118,
"ANGN": 119,
"ANNA": 120,
"ANNT": 121,
"ANNC": 122,
"ANNG": 123,
"ANNN": 124,
"TAAA": 125,
"TAAT": 126,
"TAAC": 127,
"TAAG": 128,
"TAAN": 129,
"TATA": 130,
"TATT": 131,
"TATC": 132,
"TATG": 133,
"TATN": 134,
"TACA": 135,
"TACT": 136,
"TACC": 137,
"TACG": 138,
"TACN": 139,
"TAGA": 140,
"TAGT": 141,
"TAGC": 142,
"TAGG": 143,
"TAGN": 144,
"TANA": 145,
"TANT": 146,
"TANC": 147,
"TANG": 148,
"TANN": 149,
"TTAA": 150,
"TTAT": 151,
"TTAC": 152,
"TTAG": 153,
"TTAN": 154,
"TTTA": 155,
"TTTT": 156,
"TTTC": 157,
"TTTG": 158,
"TTTN": 159,
"TTCA": 160,
"TTCT": 161,
"TTCC": 162,
"TTCG": 163,
"TTCN": 164,
"TTGA": 165,
"TTGT": 166,
"TTGC": 167,
"TTGG": 168,
"TTGN": 169,
"TTNA": 170,
"TTNT": 171,
"TTNC": 172,
"TTNG": 173,
"TTNN": 174,
"TCAA": 175,
"TCAT": 176,
"TCAC": 177,
"TCAG": 178,
"TCAN": 179,
"TCTA": 180,
"TCTT": 181,
"TCTC": 182,
"TCTG": 183,
"TCTN": 184,
"TCCA": 185,
"TCCT": 186,
"TCCC": 187,
"TCCG": 188,
"TCCN": 189,
"TCGA": 190,
"TCGT": 191,
"TCGC": 192,
"TCGG": 193,
"TCGN": 194,
"TCNA": 195,
"TCNT": 196,
"TCNC": 197,
"TCNG": 198,
"TCNN": 199,
"TGAA": 200,
"TGAT": 201,
"TGAC": 202,
"TGAG": 203,
"TGAN": 204,
"TGTA": 205,
"TGTT": 206,
"TGTC": 207,
"TGTG": 208,
"TGTN": 209,
"TGCA": 210,
"TGCT": 211,
"TGCC": 212,
"TGCG": 213,
"TGCN": 214,
"TGGA": 215,
"TGGT": 216,
"TGGC": 217,
"TGGG": 218,
"TGGN": 219,
"TGNA": 220,
"TGNT": 221,
"TGNC": 222,
"TGNG": 223,
"TGNN": 224,
"TNAA": 225,
"TNAT": 226,
"TNAC": 227,
"TNAG": 228,
"TNAN": 229,
"TNTA": 230,
"TNTT": 231,
"TNTC": 232,
"TNTG": 233,
"TNTN": 234,
"TNCA": 235,
"TNCT": 236,
"TNCC": 237,
"TNCG": 238,
"TNCN": 239,
"TNGA": 240,
"TNGT": 241,
"TNGC": 242,
"TNGG": 243,
"TNGN": 244,
"TNNA": 245,
"TNNT": 246,
"TNNC": 247,
"TNNG": 248,
"TNNN": 249,
"CAAA": 250,
"CAAT": 251,
"CAAC": 252,
"CAAG": 253,
"CAAN": 254,
"CATA": 255,
"CATT": 256,
"CATC": 257,
"CATG": 258,
"CATN": 259,
"CACA": 260,
"CACT": 261,
"CACC": 262,
"CACG": 263,
"CACN": 264,
"CAGA": 265,
"CAGT": 266,
"CAGC": 267,
"CAGG": 268,
"CAGN": 269,
"CANA": 270,
"CANT": 271,
"CANC": 272,
"CANG": 273,
"CANN": 274,
"CTAA": 275,
"CTAT": 276,
"CTAC": 277,
"CTAG": 278,
"CTAN": 279,
"CTTA": 280,
"CTTT": 281,
"CTTC": 282,
"CTTG": 283,
"CTTN": 284,
"CTCA": 285,
"CTCT": 286,
"CTCC": 287,
"CTCG": 288,
"CTCN": 289,
"CTGA": 290,
"CTGT": 291,
"CTGC": 292,
"CTGG": 293,
"CTGN": 294,
"CTNA": 295,
"CTNT": 296,
"CTNC": 297,
"CTNG": 298,
"CTNN": 299,
"CCAA": 300,
"CCAT": 301,
"CCAC": 302,
"CCAG": 303,
"CCAN": 304,
"CCTA": 305,
"CCTT": 306,
"CCTC": 307,
"CCTG": 308,
"CCTN": 309,
"CCCA": 310,
"CCCT": 311,
"CCCC": 312,
"CCCG": 313,
"CCCN": 314,
"CCGA": 315,
"CCGT": 316,
"CCGC": 317,
"CCGG": 318,
"CCGN": 319,
"CCNA": 320,
"CCNT": 321,
"CCNC": 322,
"CCNG": 323,
"CCNN": 324,
"CGAA": 325,
"CGAT": 326,
"CGAC": 327,
"CGAG": 328,
"CGAN": 329,
"CGTA": 330,
"CGTT": 331,
"CGTC": 332,
"CGTG": 333,
"CGTN": 334,
"CGCA": 335,
"CGCT": 336,
"CGCC": 337,
"CGCG": 338,
"CGCN": 339,
"CGGA": 340,
"CGGT": 341,
"CGGC": 342,
"CGGG": 343,
"CGGN": 344,
"CGNA": 345,
"CGNT": 346,
"CGNC": 347,
"CGNG": 348,
"CGNN": 349,
"CNAA": 350,
"CNAT": 351,
"CNAC": 352,
"CNAG": 353,
"CNAN": 354,
"CNTA": 355,
"CNTT": 356,
"CNTC": 357,
"CNTG": 358,
"CNTN": 359,
"CNCA": 360,
"CNCT": 361,
"CNCC": 362,
"CNCG": 363,
"CNCN": 364,
"CNGA": 365,
"CNGT": 366,
"CNGC": 367,
"CNGG": 368,
"CNGN": 369,
"CNNA": 370,
"CNNT": 371,
"CNNC": 372,
"CNNG": 373,
"CNNN": 374,
"GAAA": 375,
"GAAT": 376,
"GAAC": 377,
"GAAG": 378,
"GAAN": 379,
"GATA": 380,
"GATT": 381,
"GATC": 382,
"GATG": 383,
"GATN": 384,
"GACA": 385,
"GACT": 386,
"GACC": 387,
"GACG": 388,
"GACN": 389,
"GAGA": 390,
"GAGT": 391,
"GAGC": 392,
"GAGG": 393,
"GAGN": 394,
"GANA": 395,
"GANT": 396,
"GANC": 397,
"GANG": 398,
"GANN": 399,
"GTAA": 400,
"GTAT": 401,
"GTAC": 402,
"GTAG": 403,
"GTAN": 404,
"GTTA": 405,
"GTTT": 406,
"GTTC": 407,
"GTTG": 408,
"GTTN": 409,
"GTCA": 410,
"GTCT": 411,
"GTCC": 412,
"GTCG": 413,
"GTCN": 414,
"GTGA": 415,
"GTGT": 416,
"GTGC": 417,
"GTGG": 418,
"GTGN": 419,
"GTNA": 420,
"GTNT": 421,
"GTNC": 422,
"GTNG": 423,
"GTNN": 424,
"GCAA": 425,
"GCAT": 426,
"GCAC": 427,
"GCAG": 428,
"GCAN": 429,
"GCTA": 430,
"GCTT": 431,
"GCTC": 432,
"GCTG": 433,
"GCTN": 434,
"GCCA": 435,
"GCCT": 436,
"GCCC": 437,
"GCCG": 438,
"GCCN": 439,
"GCGA": 440,
"GCGT": 441,
"GCGC": 442,
"GCGG": 443,
"GCGN": 444,
"GCNA": 445,
"GCNT": 446,
"GCNC": 447,
"GCNG": 448,
"GCNN": 449,
"GGAA": 450,
"GGAT": 451,
"GGAC": 452,
"GGAG": 453,
"GGAN": 454,
"GGTA": 455,
"GGTT": 456,
"GGTC": 457,
"GGTG": 458,
"GGTN": 459,
"GGCA": 460,
"GGCT": 461,
"GGCC": 462,
"GGCG": 463,
"GGCN": 464,
"GGGA": 465,
"GGGT": 466,
"GGGC": 467,
"GGGG": 468,
"GGGN": 469,
"GGNA": 470,
"GGNT": 471,
"GGNC": 472,
"GGNG": 473,
"GGNN": 474,
"GNAA": 475,
"GNAT": 476,
"GNAC": 477,
"GNAG": 478,
"GNAN": 479,
"GNTA": 480,
"GNTT": 481,
"GNTC": 482,
"GNTG": 483,
"GNTN": 484,
"GNCA": 485,
"GNCT": 486,
"GNCC": 487,
"GNCG": 488,
"GNCN": 489,
"GNGA": 490,
"GNGT": 491,
"GNGC": 492,
"GNGG": 493,
"GNGN": 494,
"GNNA": 495,
"GNNT": 496,
"GNNC": 497,
"GNNG": 498,
"GNNN": 499,
"NAAA": 500,
"NAAT": 501,
"NAAC": 502,
"NAAG": 503,
"NAAN": 504,
"NATA": 505,
"NATT": 506,
"NATC": 507,
"NATG": 508,
"NATN": 509,
"NACA": 510,
"NACT": 511,
"NACC": 512,
"NACG": 513,
"NACN": 514,
"NAGA": 515,
"NAGT": 516,
"NAGC": 517,
"NAGG": 518,
"NAGN": 519,
"NANA": 520,
"NANT": 521,
"NANC": 522,
"NANG": 523,
"NANN": 524,
"NTAA": 525,
"NTAT": 526,
"NTAC": 527,
"NTAG": 528,
"NTAN": 529,
"NTTA": 530,
"NTTT": 531,
"NTTC": 532,
"NTTG": 533,
"NTTN": 534,
"NTCA": 535,
"NTCT": 536,
"NTCC": 537,
"NTCG": 538,
"NTCN": 539,
"NTGA": 540,
"NTGT": 541,
"NTGC": 542,
"NTGG": 543,
"NTGN": 544,
"NTNA": 545,
"NTNT": 546,
"NTNC": 547,
"NTNG": 548,
"NTNN": 549,
"NCAA": 550,
"NCAT": 551,
"NCAC": 552,
"NCAG": 553,
"NCAN": 554,
"NCTA": 555,
"NCTT": 556,
"NCTC": 557,
"NCTG": 558,
"NCTN": 559,
"NCCA": 560,
"NCCT": 561,
"NCCC": 562,
"NCCG": 563,
"NCCN": 564,
"NCGA": 565,
"NCGT": 566,
"NCGC": 567,
"NCGG": 568,
"NCGN": 569,
"NCNA": 570,
"NCNT": 571,
"NCNC": 572,
"NCNG": 573,
"NCNN": 574,
"NGAA": 575,
"NGAT": 576,
"NGAC": 577,
"NGAG": 578,
"NGAN": 579,
"NGTA": 580,
"NGTT": 581,
"NGTC": 582,
"NGTG": 583,
"NGTN": 584,
"NGCA": 585,
"NGCT": 586,
"NGCC": 587,
"NGCG": 588,
"NGCN": 589,
"NGGA": 590,
"NGGT": 591,
"NGGC": 592,
"NGGG": 593,
"NGGN": 594,
"NGNA": 595,
"NGNT": 596,
"NGNC": 597,
"NGNG": 598,
"NGNN": 599,
"NNAA": 600,
"NNAT": 601,
"NNAC": 602,
"NNAG": 603,
"NNAN": 604,
"NNTA": 605,
"NNTT": 606,
"NNTC": 607,
"NNTG": 608,
"NNTN": 609,
"NNCA": 610,
"NNCT": 611,
"NNCC": 612,
"NNCG": 613,
"NNCN": 614,
"NNGA": 615,
"NNGT": 616,
"NNGC": 617,
"NNGG": 618,
"NNGN": 619,
"NNNA": 620,
"NNNT": 621,
"NNNC": 622,
"NNNG": 623,
"NNNN": 624,
"[MASK]": 625,
"[CLS]": 626,
"[PAD]": 627,
"[SEP]": 628,
"[UNK]": 629
}
}
}