Upload tokenizer
Browse files- special_tokens_map.json +7 -0
- tokenizer.json +780 -0
- tokenizer_config.json +57 -0
- vocab.txt +630 -0
special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
tokenizer.json
ADDED
@@ -0,0 +1,780 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"version": "1.0",
|
3 |
+
"truncation": null,
|
4 |
+
"padding": null,
|
5 |
+
"added_tokens": [
|
6 |
+
{
|
7 |
+
"id": 625,
|
8 |
+
"content": "[MASK]",
|
9 |
+
"single_word": false,
|
10 |
+
"lstrip": false,
|
11 |
+
"rstrip": false,
|
12 |
+
"normalized": false,
|
13 |
+
"special": true
|
14 |
+
},
|
15 |
+
{
|
16 |
+
"id": 626,
|
17 |
+
"content": "[CLS]",
|
18 |
+
"single_word": false,
|
19 |
+
"lstrip": false,
|
20 |
+
"rstrip": false,
|
21 |
+
"normalized": false,
|
22 |
+
"special": true
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"id": 627,
|
26 |
+
"content": "[PAD]",
|
27 |
+
"single_word": false,
|
28 |
+
"lstrip": false,
|
29 |
+
"rstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"special": true
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"id": 628,
|
35 |
+
"content": "[SEP]",
|
36 |
+
"single_word": false,
|
37 |
+
"lstrip": false,
|
38 |
+
"rstrip": false,
|
39 |
+
"normalized": false,
|
40 |
+
"special": true
|
41 |
+
},
|
42 |
+
{
|
43 |
+
"id": 629,
|
44 |
+
"content": "[UNK]",
|
45 |
+
"single_word": false,
|
46 |
+
"lstrip": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"normalized": false,
|
49 |
+
"special": true
|
50 |
+
}
|
51 |
+
],
|
52 |
+
"normalizer": {
|
53 |
+
"type": "BertNormalizer",
|
54 |
+
"clean_text": true,
|
55 |
+
"handle_chinese_chars": true,
|
56 |
+
"strip_accents": null,
|
57 |
+
"lowercase": false
|
58 |
+
},
|
59 |
+
"pre_tokenizer": {
|
60 |
+
"type": "BertPreTokenizer"
|
61 |
+
},
|
62 |
+
"post_processor": {
|
63 |
+
"type": "TemplateProcessing",
|
64 |
+
"single": [
|
65 |
+
{
|
66 |
+
"SpecialToken": {
|
67 |
+
"id": "[CLS]",
|
68 |
+
"type_id": 0
|
69 |
+
}
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"Sequence": {
|
73 |
+
"id": "A",
|
74 |
+
"type_id": 0
|
75 |
+
}
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"SpecialToken": {
|
79 |
+
"id": "[SEP]",
|
80 |
+
"type_id": 0
|
81 |
+
}
|
82 |
+
}
|
83 |
+
],
|
84 |
+
"pair": [
|
85 |
+
{
|
86 |
+
"SpecialToken": {
|
87 |
+
"id": "[CLS]",
|
88 |
+
"type_id": 0
|
89 |
+
}
|
90 |
+
},
|
91 |
+
{
|
92 |
+
"Sequence": {
|
93 |
+
"id": "A",
|
94 |
+
"type_id": 0
|
95 |
+
}
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"SpecialToken": {
|
99 |
+
"id": "[SEP]",
|
100 |
+
"type_id": 0
|
101 |
+
}
|
102 |
+
},
|
103 |
+
{
|
104 |
+
"Sequence": {
|
105 |
+
"id": "B",
|
106 |
+
"type_id": 1
|
107 |
+
}
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"SpecialToken": {
|
111 |
+
"id": "[SEP]",
|
112 |
+
"type_id": 1
|
113 |
+
}
|
114 |
+
}
|
115 |
+
],
|
116 |
+
"special_tokens": {
|
117 |
+
"[CLS]": {
|
118 |
+
"id": "[CLS]",
|
119 |
+
"ids": [
|
120 |
+
626
|
121 |
+
],
|
122 |
+
"tokens": [
|
123 |
+
"[CLS]"
|
124 |
+
]
|
125 |
+
},
|
126 |
+
"[SEP]": {
|
127 |
+
"id": "[SEP]",
|
128 |
+
"ids": [
|
129 |
+
628
|
130 |
+
],
|
131 |
+
"tokens": [
|
132 |
+
"[SEP]"
|
133 |
+
]
|
134 |
+
}
|
135 |
+
}
|
136 |
+
},
|
137 |
+
"decoder": {
|
138 |
+
"type": "WordPiece",
|
139 |
+
"prefix": "##",
|
140 |
+
"cleanup": true
|
141 |
+
},
|
142 |
+
"model": {
|
143 |
+
"type": "WordPiece",
|
144 |
+
"unk_token": "[UNK]",
|
145 |
+
"continuing_subword_prefix": "##",
|
146 |
+
"max_input_chars_per_word": 100,
|
147 |
+
"vocab": {
|
148 |
+
"AAAA": 0,
|
149 |
+
"AAAT": 1,
|
150 |
+
"AAAC": 2,
|
151 |
+
"AAAG": 3,
|
152 |
+
"AAAN": 4,
|
153 |
+
"AATA": 5,
|
154 |
+
"AATT": 6,
|
155 |
+
"AATC": 7,
|
156 |
+
"AATG": 8,
|
157 |
+
"AATN": 9,
|
158 |
+
"AACA": 10,
|
159 |
+
"AACT": 11,
|
160 |
+
"AACC": 12,
|
161 |
+
"AACG": 13,
|
162 |
+
"AACN": 14,
|
163 |
+
"AAGA": 15,
|
164 |
+
"AAGT": 16,
|
165 |
+
"AAGC": 17,
|
166 |
+
"AAGG": 18,
|
167 |
+
"AAGN": 19,
|
168 |
+
"AANA": 20,
|
169 |
+
"AANT": 21,
|
170 |
+
"AANC": 22,
|
171 |
+
"AANG": 23,
|
172 |
+
"AANN": 24,
|
173 |
+
"ATAA": 25,
|
174 |
+
"ATAT": 26,
|
175 |
+
"ATAC": 27,
|
176 |
+
"ATAG": 28,
|
177 |
+
"ATAN": 29,
|
178 |
+
"ATTA": 30,
|
179 |
+
"ATTT": 31,
|
180 |
+
"ATTC": 32,
|
181 |
+
"ATTG": 33,
|
182 |
+
"ATTN": 34,
|
183 |
+
"ATCA": 35,
|
184 |
+
"ATCT": 36,
|
185 |
+
"ATCC": 37,
|
186 |
+
"ATCG": 38,
|
187 |
+
"ATCN": 39,
|
188 |
+
"ATGA": 40,
|
189 |
+
"ATGT": 41,
|
190 |
+
"ATGC": 42,
|
191 |
+
"ATGG": 43,
|
192 |
+
"ATGN": 44,
|
193 |
+
"ATNA": 45,
|
194 |
+
"ATNT": 46,
|
195 |
+
"ATNC": 47,
|
196 |
+
"ATNG": 48,
|
197 |
+
"ATNN": 49,
|
198 |
+
"ACAA": 50,
|
199 |
+
"ACAT": 51,
|
200 |
+
"ACAC": 52,
|
201 |
+
"ACAG": 53,
|
202 |
+
"ACAN": 54,
|
203 |
+
"ACTA": 55,
|
204 |
+
"ACTT": 56,
|
205 |
+
"ACTC": 57,
|
206 |
+
"ACTG": 58,
|
207 |
+
"ACTN": 59,
|
208 |
+
"ACCA": 60,
|
209 |
+
"ACCT": 61,
|
210 |
+
"ACCC": 62,
|
211 |
+
"ACCG": 63,
|
212 |
+
"ACCN": 64,
|
213 |
+
"ACGA": 65,
|
214 |
+
"ACGT": 66,
|
215 |
+
"ACGC": 67,
|
216 |
+
"ACGG": 68,
|
217 |
+
"ACGN": 69,
|
218 |
+
"ACNA": 70,
|
219 |
+
"ACNT": 71,
|
220 |
+
"ACNC": 72,
|
221 |
+
"ACNG": 73,
|
222 |
+
"ACNN": 74,
|
223 |
+
"AGAA": 75,
|
224 |
+
"AGAT": 76,
|
225 |
+
"AGAC": 77,
|
226 |
+
"AGAG": 78,
|
227 |
+
"AGAN": 79,
|
228 |
+
"AGTA": 80,
|
229 |
+
"AGTT": 81,
|
230 |
+
"AGTC": 82,
|
231 |
+
"AGTG": 83,
|
232 |
+
"AGTN": 84,
|
233 |
+
"AGCA": 85,
|
234 |
+
"AGCT": 86,
|
235 |
+
"AGCC": 87,
|
236 |
+
"AGCG": 88,
|
237 |
+
"AGCN": 89,
|
238 |
+
"AGGA": 90,
|
239 |
+
"AGGT": 91,
|
240 |
+
"AGGC": 92,
|
241 |
+
"AGGG": 93,
|
242 |
+
"AGGN": 94,
|
243 |
+
"AGNA": 95,
|
244 |
+
"AGNT": 96,
|
245 |
+
"AGNC": 97,
|
246 |
+
"AGNG": 98,
|
247 |
+
"AGNN": 99,
|
248 |
+
"ANAA": 100,
|
249 |
+
"ANAT": 101,
|
250 |
+
"ANAC": 102,
|
251 |
+
"ANAG": 103,
|
252 |
+
"ANAN": 104,
|
253 |
+
"ANTA": 105,
|
254 |
+
"ANTT": 106,
|
255 |
+
"ANTC": 107,
|
256 |
+
"ANTG": 108,
|
257 |
+
"ANTN": 109,
|
258 |
+
"ANCA": 110,
|
259 |
+
"ANCT": 111,
|
260 |
+
"ANCC": 112,
|
261 |
+
"ANCG": 113,
|
262 |
+
"ANCN": 114,
|
263 |
+
"ANGA": 115,
|
264 |
+
"ANGT": 116,
|
265 |
+
"ANGC": 117,
|
266 |
+
"ANGG": 118,
|
267 |
+
"ANGN": 119,
|
268 |
+
"ANNA": 120,
|
269 |
+
"ANNT": 121,
|
270 |
+
"ANNC": 122,
|
271 |
+
"ANNG": 123,
|
272 |
+
"ANNN": 124,
|
273 |
+
"TAAA": 125,
|
274 |
+
"TAAT": 126,
|
275 |
+
"TAAC": 127,
|
276 |
+
"TAAG": 128,
|
277 |
+
"TAAN": 129,
|
278 |
+
"TATA": 130,
|
279 |
+
"TATT": 131,
|
280 |
+
"TATC": 132,
|
281 |
+
"TATG": 133,
|
282 |
+
"TATN": 134,
|
283 |
+
"TACA": 135,
|
284 |
+
"TACT": 136,
|
285 |
+
"TACC": 137,
|
286 |
+
"TACG": 138,
|
287 |
+
"TACN": 139,
|
288 |
+
"TAGA": 140,
|
289 |
+
"TAGT": 141,
|
290 |
+
"TAGC": 142,
|
291 |
+
"TAGG": 143,
|
292 |
+
"TAGN": 144,
|
293 |
+
"TANA": 145,
|
294 |
+
"TANT": 146,
|
295 |
+
"TANC": 147,
|
296 |
+
"TANG": 148,
|
297 |
+
"TANN": 149,
|
298 |
+
"TTAA": 150,
|
299 |
+
"TTAT": 151,
|
300 |
+
"TTAC": 152,
|
301 |
+
"TTAG": 153,
|
302 |
+
"TTAN": 154,
|
303 |
+
"TTTA": 155,
|
304 |
+
"TTTT": 156,
|
305 |
+
"TTTC": 157,
|
306 |
+
"TTTG": 158,
|
307 |
+
"TTTN": 159,
|
308 |
+
"TTCA": 160,
|
309 |
+
"TTCT": 161,
|
310 |
+
"TTCC": 162,
|
311 |
+
"TTCG": 163,
|
312 |
+
"TTCN": 164,
|
313 |
+
"TTGA": 165,
|
314 |
+
"TTGT": 166,
|
315 |
+
"TTGC": 167,
|
316 |
+
"TTGG": 168,
|
317 |
+
"TTGN": 169,
|
318 |
+
"TTNA": 170,
|
319 |
+
"TTNT": 171,
|
320 |
+
"TTNC": 172,
|
321 |
+
"TTNG": 173,
|
322 |
+
"TTNN": 174,
|
323 |
+
"TCAA": 175,
|
324 |
+
"TCAT": 176,
|
325 |
+
"TCAC": 177,
|
326 |
+
"TCAG": 178,
|
327 |
+
"TCAN": 179,
|
328 |
+
"TCTA": 180,
|
329 |
+
"TCTT": 181,
|
330 |
+
"TCTC": 182,
|
331 |
+
"TCTG": 183,
|
332 |
+
"TCTN": 184,
|
333 |
+
"TCCA": 185,
|
334 |
+
"TCCT": 186,
|
335 |
+
"TCCC": 187,
|
336 |
+
"TCCG": 188,
|
337 |
+
"TCCN": 189,
|
338 |
+
"TCGA": 190,
|
339 |
+
"TCGT": 191,
|
340 |
+
"TCGC": 192,
|
341 |
+
"TCGG": 193,
|
342 |
+
"TCGN": 194,
|
343 |
+
"TCNA": 195,
|
344 |
+
"TCNT": 196,
|
345 |
+
"TCNC": 197,
|
346 |
+
"TCNG": 198,
|
347 |
+
"TCNN": 199,
|
348 |
+
"TGAA": 200,
|
349 |
+
"TGAT": 201,
|
350 |
+
"TGAC": 202,
|
351 |
+
"TGAG": 203,
|
352 |
+
"TGAN": 204,
|
353 |
+
"TGTA": 205,
|
354 |
+
"TGTT": 206,
|
355 |
+
"TGTC": 207,
|
356 |
+
"TGTG": 208,
|
357 |
+
"TGTN": 209,
|
358 |
+
"TGCA": 210,
|
359 |
+
"TGCT": 211,
|
360 |
+
"TGCC": 212,
|
361 |
+
"TGCG": 213,
|
362 |
+
"TGCN": 214,
|
363 |
+
"TGGA": 215,
|
364 |
+
"TGGT": 216,
|
365 |
+
"TGGC": 217,
|
366 |
+
"TGGG": 218,
|
367 |
+
"TGGN": 219,
|
368 |
+
"TGNA": 220,
|
369 |
+
"TGNT": 221,
|
370 |
+
"TGNC": 222,
|
371 |
+
"TGNG": 223,
|
372 |
+
"TGNN": 224,
|
373 |
+
"TNAA": 225,
|
374 |
+
"TNAT": 226,
|
375 |
+
"TNAC": 227,
|
376 |
+
"TNAG": 228,
|
377 |
+
"TNAN": 229,
|
378 |
+
"TNTA": 230,
|
379 |
+
"TNTT": 231,
|
380 |
+
"TNTC": 232,
|
381 |
+
"TNTG": 233,
|
382 |
+
"TNTN": 234,
|
383 |
+
"TNCA": 235,
|
384 |
+
"TNCT": 236,
|
385 |
+
"TNCC": 237,
|
386 |
+
"TNCG": 238,
|
387 |
+
"TNCN": 239,
|
388 |
+
"TNGA": 240,
|
389 |
+
"TNGT": 241,
|
390 |
+
"TNGC": 242,
|
391 |
+
"TNGG": 243,
|
392 |
+
"TNGN": 244,
|
393 |
+
"TNNA": 245,
|
394 |
+
"TNNT": 246,
|
395 |
+
"TNNC": 247,
|
396 |
+
"TNNG": 248,
|
397 |
+
"TNNN": 249,
|
398 |
+
"CAAA": 250,
|
399 |
+
"CAAT": 251,
|
400 |
+
"CAAC": 252,
|
401 |
+
"CAAG": 253,
|
402 |
+
"CAAN": 254,
|
403 |
+
"CATA": 255,
|
404 |
+
"CATT": 256,
|
405 |
+
"CATC": 257,
|
406 |
+
"CATG": 258,
|
407 |
+
"CATN": 259,
|
408 |
+
"CACA": 260,
|
409 |
+
"CACT": 261,
|
410 |
+
"CACC": 262,
|
411 |
+
"CACG": 263,
|
412 |
+
"CACN": 264,
|
413 |
+
"CAGA": 265,
|
414 |
+
"CAGT": 266,
|
415 |
+
"CAGC": 267,
|
416 |
+
"CAGG": 268,
|
417 |
+
"CAGN": 269,
|
418 |
+
"CANA": 270,
|
419 |
+
"CANT": 271,
|
420 |
+
"CANC": 272,
|
421 |
+
"CANG": 273,
|
422 |
+
"CANN": 274,
|
423 |
+
"CTAA": 275,
|
424 |
+
"CTAT": 276,
|
425 |
+
"CTAC": 277,
|
426 |
+
"CTAG": 278,
|
427 |
+
"CTAN": 279,
|
428 |
+
"CTTA": 280,
|
429 |
+
"CTTT": 281,
|
430 |
+
"CTTC": 282,
|
431 |
+
"CTTG": 283,
|
432 |
+
"CTTN": 284,
|
433 |
+
"CTCA": 285,
|
434 |
+
"CTCT": 286,
|
435 |
+
"CTCC": 287,
|
436 |
+
"CTCG": 288,
|
437 |
+
"CTCN": 289,
|
438 |
+
"CTGA": 290,
|
439 |
+
"CTGT": 291,
|
440 |
+
"CTGC": 292,
|
441 |
+
"CTGG": 293,
|
442 |
+
"CTGN": 294,
|
443 |
+
"CTNA": 295,
|
444 |
+
"CTNT": 296,
|
445 |
+
"CTNC": 297,
|
446 |
+
"CTNG": 298,
|
447 |
+
"CTNN": 299,
|
448 |
+
"CCAA": 300,
|
449 |
+
"CCAT": 301,
|
450 |
+
"CCAC": 302,
|
451 |
+
"CCAG": 303,
|
452 |
+
"CCAN": 304,
|
453 |
+
"CCTA": 305,
|
454 |
+
"CCTT": 306,
|
455 |
+
"CCTC": 307,
|
456 |
+
"CCTG": 308,
|
457 |
+
"CCTN": 309,
|
458 |
+
"CCCA": 310,
|
459 |
+
"CCCT": 311,
|
460 |
+
"CCCC": 312,
|
461 |
+
"CCCG": 313,
|
462 |
+
"CCCN": 314,
|
463 |
+
"CCGA": 315,
|
464 |
+
"CCGT": 316,
|
465 |
+
"CCGC": 317,
|
466 |
+
"CCGG": 318,
|
467 |
+
"CCGN": 319,
|
468 |
+
"CCNA": 320,
|
469 |
+
"CCNT": 321,
|
470 |
+
"CCNC": 322,
|
471 |
+
"CCNG": 323,
|
472 |
+
"CCNN": 324,
|
473 |
+
"CGAA": 325,
|
474 |
+
"CGAT": 326,
|
475 |
+
"CGAC": 327,
|
476 |
+
"CGAG": 328,
|
477 |
+
"CGAN": 329,
|
478 |
+
"CGTA": 330,
|
479 |
+
"CGTT": 331,
|
480 |
+
"CGTC": 332,
|
481 |
+
"CGTG": 333,
|
482 |
+
"CGTN": 334,
|
483 |
+
"CGCA": 335,
|
484 |
+
"CGCT": 336,
|
485 |
+
"CGCC": 337,
|
486 |
+
"CGCG": 338,
|
487 |
+
"CGCN": 339,
|
488 |
+
"CGGA": 340,
|
489 |
+
"CGGT": 341,
|
490 |
+
"CGGC": 342,
|
491 |
+
"CGGG": 343,
|
492 |
+
"CGGN": 344,
|
493 |
+
"CGNA": 345,
|
494 |
+
"CGNT": 346,
|
495 |
+
"CGNC": 347,
|
496 |
+
"CGNG": 348,
|
497 |
+
"CGNN": 349,
|
498 |
+
"CNAA": 350,
|
499 |
+
"CNAT": 351,
|
500 |
+
"CNAC": 352,
|
501 |
+
"CNAG": 353,
|
502 |
+
"CNAN": 354,
|
503 |
+
"CNTA": 355,
|
504 |
+
"CNTT": 356,
|
505 |
+
"CNTC": 357,
|
506 |
+
"CNTG": 358,
|
507 |
+
"CNTN": 359,
|
508 |
+
"CNCA": 360,
|
509 |
+
"CNCT": 361,
|
510 |
+
"CNCC": 362,
|
511 |
+
"CNCG": 363,
|
512 |
+
"CNCN": 364,
|
513 |
+
"CNGA": 365,
|
514 |
+
"CNGT": 366,
|
515 |
+
"CNGC": 367,
|
516 |
+
"CNGG": 368,
|
517 |
+
"CNGN": 369,
|
518 |
+
"CNNA": 370,
|
519 |
+
"CNNT": 371,
|
520 |
+
"CNNC": 372,
|
521 |
+
"CNNG": 373,
|
522 |
+
"CNNN": 374,
|
523 |
+
"GAAA": 375,
|
524 |
+
"GAAT": 376,
|
525 |
+
"GAAC": 377,
|
526 |
+
"GAAG": 378,
|
527 |
+
"GAAN": 379,
|
528 |
+
"GATA": 380,
|
529 |
+
"GATT": 381,
|
530 |
+
"GATC": 382,
|
531 |
+
"GATG": 383,
|
532 |
+
"GATN": 384,
|
533 |
+
"GACA": 385,
|
534 |
+
"GACT": 386,
|
535 |
+
"GACC": 387,
|
536 |
+
"GACG": 388,
|
537 |
+
"GACN": 389,
|
538 |
+
"GAGA": 390,
|
539 |
+
"GAGT": 391,
|
540 |
+
"GAGC": 392,
|
541 |
+
"GAGG": 393,
|
542 |
+
"GAGN": 394,
|
543 |
+
"GANA": 395,
|
544 |
+
"GANT": 396,
|
545 |
+
"GANC": 397,
|
546 |
+
"GANG": 398,
|
547 |
+
"GANN": 399,
|
548 |
+
"GTAA": 400,
|
549 |
+
"GTAT": 401,
|
550 |
+
"GTAC": 402,
|
551 |
+
"GTAG": 403,
|
552 |
+
"GTAN": 404,
|
553 |
+
"GTTA": 405,
|
554 |
+
"GTTT": 406,
|
555 |
+
"GTTC": 407,
|
556 |
+
"GTTG": 408,
|
557 |
+
"GTTN": 409,
|
558 |
+
"GTCA": 410,
|
559 |
+
"GTCT": 411,
|
560 |
+
"GTCC": 412,
|
561 |
+
"GTCG": 413,
|
562 |
+
"GTCN": 414,
|
563 |
+
"GTGA": 415,
|
564 |
+
"GTGT": 416,
|
565 |
+
"GTGC": 417,
|
566 |
+
"GTGG": 418,
|
567 |
+
"GTGN": 419,
|
568 |
+
"GTNA": 420,
|
569 |
+
"GTNT": 421,
|
570 |
+
"GTNC": 422,
|
571 |
+
"GTNG": 423,
|
572 |
+
"GTNN": 424,
|
573 |
+
"GCAA": 425,
|
574 |
+
"GCAT": 426,
|
575 |
+
"GCAC": 427,
|
576 |
+
"GCAG": 428,
|
577 |
+
"GCAN": 429,
|
578 |
+
"GCTA": 430,
|
579 |
+
"GCTT": 431,
|
580 |
+
"GCTC": 432,
|
581 |
+
"GCTG": 433,
|
582 |
+
"GCTN": 434,
|
583 |
+
"GCCA": 435,
|
584 |
+
"GCCT": 436,
|
585 |
+
"GCCC": 437,
|
586 |
+
"GCCG": 438,
|
587 |
+
"GCCN": 439,
|
588 |
+
"GCGA": 440,
|
589 |
+
"GCGT": 441,
|
590 |
+
"GCGC": 442,
|
591 |
+
"GCGG": 443,
|
592 |
+
"GCGN": 444,
|
593 |
+
"GCNA": 445,
|
594 |
+
"GCNT": 446,
|
595 |
+
"GCNC": 447,
|
596 |
+
"GCNG": 448,
|
597 |
+
"GCNN": 449,
|
598 |
+
"GGAA": 450,
|
599 |
+
"GGAT": 451,
|
600 |
+
"GGAC": 452,
|
601 |
+
"GGAG": 453,
|
602 |
+
"GGAN": 454,
|
603 |
+
"GGTA": 455,
|
604 |
+
"GGTT": 456,
|
605 |
+
"GGTC": 457,
|
606 |
+
"GGTG": 458,
|
607 |
+
"GGTN": 459,
|
608 |
+
"GGCA": 460,
|
609 |
+
"GGCT": 461,
|
610 |
+
"GGCC": 462,
|
611 |
+
"GGCG": 463,
|
612 |
+
"GGCN": 464,
|
613 |
+
"GGGA": 465,
|
614 |
+
"GGGT": 466,
|
615 |
+
"GGGC": 467,
|
616 |
+
"GGGG": 468,
|
617 |
+
"GGGN": 469,
|
618 |
+
"GGNA": 470,
|
619 |
+
"GGNT": 471,
|
620 |
+
"GGNC": 472,
|
621 |
+
"GGNG": 473,
|
622 |
+
"GGNN": 474,
|
623 |
+
"GNAA": 475,
|
624 |
+
"GNAT": 476,
|
625 |
+
"GNAC": 477,
|
626 |
+
"GNAG": 478,
|
627 |
+
"GNAN": 479,
|
628 |
+
"GNTA": 480,
|
629 |
+
"GNTT": 481,
|
630 |
+
"GNTC": 482,
|
631 |
+
"GNTG": 483,
|
632 |
+
"GNTN": 484,
|
633 |
+
"GNCA": 485,
|
634 |
+
"GNCT": 486,
|
635 |
+
"GNCC": 487,
|
636 |
+
"GNCG": 488,
|
637 |
+
"GNCN": 489,
|
638 |
+
"GNGA": 490,
|
639 |
+
"GNGT": 491,
|
640 |
+
"GNGC": 492,
|
641 |
+
"GNGG": 493,
|
642 |
+
"GNGN": 494,
|
643 |
+
"GNNA": 495,
|
644 |
+
"GNNT": 496,
|
645 |
+
"GNNC": 497,
|
646 |
+
"GNNG": 498,
|
647 |
+
"GNNN": 499,
|
648 |
+
"NAAA": 500,
|
649 |
+
"NAAT": 501,
|
650 |
+
"NAAC": 502,
|
651 |
+
"NAAG": 503,
|
652 |
+
"NAAN": 504,
|
653 |
+
"NATA": 505,
|
654 |
+
"NATT": 506,
|
655 |
+
"NATC": 507,
|
656 |
+
"NATG": 508,
|
657 |
+
"NATN": 509,
|
658 |
+
"NACA": 510,
|
659 |
+
"NACT": 511,
|
660 |
+
"NACC": 512,
|
661 |
+
"NACG": 513,
|
662 |
+
"NACN": 514,
|
663 |
+
"NAGA": 515,
|
664 |
+
"NAGT": 516,
|
665 |
+
"NAGC": 517,
|
666 |
+
"NAGG": 518,
|
667 |
+
"NAGN": 519,
|
668 |
+
"NANA": 520,
|
669 |
+
"NANT": 521,
|
670 |
+
"NANC": 522,
|
671 |
+
"NANG": 523,
|
672 |
+
"NANN": 524,
|
673 |
+
"NTAA": 525,
|
674 |
+
"NTAT": 526,
|
675 |
+
"NTAC": 527,
|
676 |
+
"NTAG": 528,
|
677 |
+
"NTAN": 529,
|
678 |
+
"NTTA": 530,
|
679 |
+
"NTTT": 531,
|
680 |
+
"NTTC": 532,
|
681 |
+
"NTTG": 533,
|
682 |
+
"NTTN": 534,
|
683 |
+
"NTCA": 535,
|
684 |
+
"NTCT": 536,
|
685 |
+
"NTCC": 537,
|
686 |
+
"NTCG": 538,
|
687 |
+
"NTCN": 539,
|
688 |
+
"NTGA": 540,
|
689 |
+
"NTGT": 541,
|
690 |
+
"NTGC": 542,
|
691 |
+
"NTGG": 543,
|
692 |
+
"NTGN": 544,
|
693 |
+
"NTNA": 545,
|
694 |
+
"NTNT": 546,
|
695 |
+
"NTNC": 547,
|
696 |
+
"NTNG": 548,
|
697 |
+
"NTNN": 549,
|
698 |
+
"NCAA": 550,
|
699 |
+
"NCAT": 551,
|
700 |
+
"NCAC": 552,
|
701 |
+
"NCAG": 553,
|
702 |
+
"NCAN": 554,
|
703 |
+
"NCTA": 555,
|
704 |
+
"NCTT": 556,
|
705 |
+
"NCTC": 557,
|
706 |
+
"NCTG": 558,
|
707 |
+
"NCTN": 559,
|
708 |
+
"NCCA": 560,
|
709 |
+
"NCCT": 561,
|
710 |
+
"NCCC": 562,
|
711 |
+
"NCCG": 563,
|
712 |
+
"NCCN": 564,
|
713 |
+
"NCGA": 565,
|
714 |
+
"NCGT": 566,
|
715 |
+
"NCGC": 567,
|
716 |
+
"NCGG": 568,
|
717 |
+
"NCGN": 569,
|
718 |
+
"NCNA": 570,
|
719 |
+
"NCNT": 571,
|
720 |
+
"NCNC": 572,
|
721 |
+
"NCNG": 573,
|
722 |
+
"NCNN": 574,
|
723 |
+
"NGAA": 575,
|
724 |
+
"NGAT": 576,
|
725 |
+
"NGAC": 577,
|
726 |
+
"NGAG": 578,
|
727 |
+
"NGAN": 579,
|
728 |
+
"NGTA": 580,
|
729 |
+
"NGTT": 581,
|
730 |
+
"NGTC": 582,
|
731 |
+
"NGTG": 583,
|
732 |
+
"NGTN": 584,
|
733 |
+
"NGCA": 585,
|
734 |
+
"NGCT": 586,
|
735 |
+
"NGCC": 587,
|
736 |
+
"NGCG": 588,
|
737 |
+
"NGCN": 589,
|
738 |
+
"NGGA": 590,
|
739 |
+
"NGGT": 591,
|
740 |
+
"NGGC": 592,
|
741 |
+
"NGGG": 593,
|
742 |
+
"NGGN": 594,
|
743 |
+
"NGNA": 595,
|
744 |
+
"NGNT": 596,
|
745 |
+
"NGNC": 597,
|
746 |
+
"NGNG": 598,
|
747 |
+
"NGNN": 599,
|
748 |
+
"NNAA": 600,
|
749 |
+
"NNAT": 601,
|
750 |
+
"NNAC": 602,
|
751 |
+
"NNAG": 603,
|
752 |
+
"NNAN": 604,
|
753 |
+
"NNTA": 605,
|
754 |
+
"NNTT": 606,
|
755 |
+
"NNTC": 607,
|
756 |
+
"NNTG": 608,
|
757 |
+
"NNTN": 609,
|
758 |
+
"NNCA": 610,
|
759 |
+
"NNCT": 611,
|
760 |
+
"NNCC": 612,
|
761 |
+
"NNCG": 613,
|
762 |
+
"NNCN": 614,
|
763 |
+
"NNGA": 615,
|
764 |
+
"NNGT": 616,
|
765 |
+
"NNGC": 617,
|
766 |
+
"NNGG": 618,
|
767 |
+
"NNGN": 619,
|
768 |
+
"NNNA": 620,
|
769 |
+
"NNNT": 621,
|
770 |
+
"NNNC": 622,
|
771 |
+
"NNNG": 623,
|
772 |
+
"NNNN": 624,
|
773 |
+
"[MASK]": 625,
|
774 |
+
"[CLS]": 626,
|
775 |
+
"[PAD]": 627,
|
776 |
+
"[SEP]": 628,
|
777 |
+
"[UNK]": 629
|
778 |
+
}
|
779 |
+
}
|
780 |
+
}
|
tokenizer_config.json
ADDED
@@ -0,0 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"625": {
|
4 |
+
"content": "[MASK]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"626": {
|
12 |
+
"content": "[CLS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"627": {
|
20 |
+
"content": "[PAD]",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"628": {
|
28 |
+
"content": "[SEP]",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"629": {
|
36 |
+
"content": "[UNK]",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
}
|
43 |
+
},
|
44 |
+
"clean_up_tokenization_spaces": true,
|
45 |
+
"cls_token": "[CLS]",
|
46 |
+
"do_basic_tokenize": true,
|
47 |
+
"do_lower_case": false,
|
48 |
+
"mask_token": "[MASK]",
|
49 |
+
"model_max_length": 512,
|
50 |
+
"never_split": null,
|
51 |
+
"pad_token": "[PAD]",
|
52 |
+
"sep_token": "[SEP]",
|
53 |
+
"strip_accents": null,
|
54 |
+
"tokenize_chinese_chars": true,
|
55 |
+
"tokenizer_class": "BertTokenizer",
|
56 |
+
"unk_token": "[UNK]"
|
57 |
+
}
|
vocab.txt
ADDED
@@ -0,0 +1,630 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
AAAA
|
2 |
+
AAAT
|
3 |
+
AAAC
|
4 |
+
AAAG
|
5 |
+
AAAN
|
6 |
+
AATA
|
7 |
+
AATT
|
8 |
+
AATC
|
9 |
+
AATG
|
10 |
+
AATN
|
11 |
+
AACA
|
12 |
+
AACT
|
13 |
+
AACC
|
14 |
+
AACG
|
15 |
+
AACN
|
16 |
+
AAGA
|
17 |
+
AAGT
|
18 |
+
AAGC
|
19 |
+
AAGG
|
20 |
+
AAGN
|
21 |
+
AANA
|
22 |
+
AANT
|
23 |
+
AANC
|
24 |
+
AANG
|
25 |
+
AANN
|
26 |
+
ATAA
|
27 |
+
ATAT
|
28 |
+
ATAC
|
29 |
+
ATAG
|
30 |
+
ATAN
|
31 |
+
ATTA
|
32 |
+
ATTT
|
33 |
+
ATTC
|
34 |
+
ATTG
|
35 |
+
ATTN
|
36 |
+
ATCA
|
37 |
+
ATCT
|
38 |
+
ATCC
|
39 |
+
ATCG
|
40 |
+
ATCN
|
41 |
+
ATGA
|
42 |
+
ATGT
|
43 |
+
ATGC
|
44 |
+
ATGG
|
45 |
+
ATGN
|
46 |
+
ATNA
|
47 |
+
ATNT
|
48 |
+
ATNC
|
49 |
+
ATNG
|
50 |
+
ATNN
|
51 |
+
ACAA
|
52 |
+
ACAT
|
53 |
+
ACAC
|
54 |
+
ACAG
|
55 |
+
ACAN
|
56 |
+
ACTA
|
57 |
+
ACTT
|
58 |
+
ACTC
|
59 |
+
ACTG
|
60 |
+
ACTN
|
61 |
+
ACCA
|
62 |
+
ACCT
|
63 |
+
ACCC
|
64 |
+
ACCG
|
65 |
+
ACCN
|
66 |
+
ACGA
|
67 |
+
ACGT
|
68 |
+
ACGC
|
69 |
+
ACGG
|
70 |
+
ACGN
|
71 |
+
ACNA
|
72 |
+
ACNT
|
73 |
+
ACNC
|
74 |
+
ACNG
|
75 |
+
ACNN
|
76 |
+
AGAA
|
77 |
+
AGAT
|
78 |
+
AGAC
|
79 |
+
AGAG
|
80 |
+
AGAN
|
81 |
+
AGTA
|
82 |
+
AGTT
|
83 |
+
AGTC
|
84 |
+
AGTG
|
85 |
+
AGTN
|
86 |
+
AGCA
|
87 |
+
AGCT
|
88 |
+
AGCC
|
89 |
+
AGCG
|
90 |
+
AGCN
|
91 |
+
AGGA
|
92 |
+
AGGT
|
93 |
+
AGGC
|
94 |
+
AGGG
|
95 |
+
AGGN
|
96 |
+
AGNA
|
97 |
+
AGNT
|
98 |
+
AGNC
|
99 |
+
AGNG
|
100 |
+
AGNN
|
101 |
+
ANAA
|
102 |
+
ANAT
|
103 |
+
ANAC
|
104 |
+
ANAG
|
105 |
+
ANAN
|
106 |
+
ANTA
|
107 |
+
ANTT
|
108 |
+
ANTC
|
109 |
+
ANTG
|
110 |
+
ANTN
|
111 |
+
ANCA
|
112 |
+
ANCT
|
113 |
+
ANCC
|
114 |
+
ANCG
|
115 |
+
ANCN
|
116 |
+
ANGA
|
117 |
+
ANGT
|
118 |
+
ANGC
|
119 |
+
ANGG
|
120 |
+
ANGN
|
121 |
+
ANNA
|
122 |
+
ANNT
|
123 |
+
ANNC
|
124 |
+
ANNG
|
125 |
+
ANNN
|
126 |
+
TAAA
|
127 |
+
TAAT
|
128 |
+
TAAC
|
129 |
+
TAAG
|
130 |
+
TAAN
|
131 |
+
TATA
|
132 |
+
TATT
|
133 |
+
TATC
|
134 |
+
TATG
|
135 |
+
TATN
|
136 |
+
TACA
|
137 |
+
TACT
|
138 |
+
TACC
|
139 |
+
TACG
|
140 |
+
TACN
|
141 |
+
TAGA
|
142 |
+
TAGT
|
143 |
+
TAGC
|
144 |
+
TAGG
|
145 |
+
TAGN
|
146 |
+
TANA
|
147 |
+
TANT
|
148 |
+
TANC
|
149 |
+
TANG
|
150 |
+
TANN
|
151 |
+
TTAA
|
152 |
+
TTAT
|
153 |
+
TTAC
|
154 |
+
TTAG
|
155 |
+
TTAN
|
156 |
+
TTTA
|
157 |
+
TTTT
|
158 |
+
TTTC
|
159 |
+
TTTG
|
160 |
+
TTTN
|
161 |
+
TTCA
|
162 |
+
TTCT
|
163 |
+
TTCC
|
164 |
+
TTCG
|
165 |
+
TTCN
|
166 |
+
TTGA
|
167 |
+
TTGT
|
168 |
+
TTGC
|
169 |
+
TTGG
|
170 |
+
TTGN
|
171 |
+
TTNA
|
172 |
+
TTNT
|
173 |
+
TTNC
|
174 |
+
TTNG
|
175 |
+
TTNN
|
176 |
+
TCAA
|
177 |
+
TCAT
|
178 |
+
TCAC
|
179 |
+
TCAG
|
180 |
+
TCAN
|
181 |
+
TCTA
|
182 |
+
TCTT
|
183 |
+
TCTC
|
184 |
+
TCTG
|
185 |
+
TCTN
|
186 |
+
TCCA
|
187 |
+
TCCT
|
188 |
+
TCCC
|
189 |
+
TCCG
|
190 |
+
TCCN
|
191 |
+
TCGA
|
192 |
+
TCGT
|
193 |
+
TCGC
|
194 |
+
TCGG
|
195 |
+
TCGN
|
196 |
+
TCNA
|
197 |
+
TCNT
|
198 |
+
TCNC
|
199 |
+
TCNG
|
200 |
+
TCNN
|
201 |
+
TGAA
|
202 |
+
TGAT
|
203 |
+
TGAC
|
204 |
+
TGAG
|
205 |
+
TGAN
|
206 |
+
TGTA
|
207 |
+
TGTT
|
208 |
+
TGTC
|
209 |
+
TGTG
|
210 |
+
TGTN
|
211 |
+
TGCA
|
212 |
+
TGCT
|
213 |
+
TGCC
|
214 |
+
TGCG
|
215 |
+
TGCN
|
216 |
+
TGGA
|
217 |
+
TGGT
|
218 |
+
TGGC
|
219 |
+
TGGG
|
220 |
+
TGGN
|
221 |
+
TGNA
|
222 |
+
TGNT
|
223 |
+
TGNC
|
224 |
+
TGNG
|
225 |
+
TGNN
|
226 |
+
TNAA
|
227 |
+
TNAT
|
228 |
+
TNAC
|
229 |
+
TNAG
|
230 |
+
TNAN
|
231 |
+
TNTA
|
232 |
+
TNTT
|
233 |
+
TNTC
|
234 |
+
TNTG
|
235 |
+
TNTN
|
236 |
+
TNCA
|
237 |
+
TNCT
|
238 |
+
TNCC
|
239 |
+
TNCG
|
240 |
+
TNCN
|
241 |
+
TNGA
|
242 |
+
TNGT
|
243 |
+
TNGC
|
244 |
+
TNGG
|
245 |
+
TNGN
|
246 |
+
TNNA
|
247 |
+
TNNT
|
248 |
+
TNNC
|
249 |
+
TNNG
|
250 |
+
TNNN
|
251 |
+
CAAA
|
252 |
+
CAAT
|
253 |
+
CAAC
|
254 |
+
CAAG
|
255 |
+
CAAN
|
256 |
+
CATA
|
257 |
+
CATT
|
258 |
+
CATC
|
259 |
+
CATG
|
260 |
+
CATN
|
261 |
+
CACA
|
262 |
+
CACT
|
263 |
+
CACC
|
264 |
+
CACG
|
265 |
+
CACN
|
266 |
+
CAGA
|
267 |
+
CAGT
|
268 |
+
CAGC
|
269 |
+
CAGG
|
270 |
+
CAGN
|
271 |
+
CANA
|
272 |
+
CANT
|
273 |
+
CANC
|
274 |
+
CANG
|
275 |
+
CANN
|
276 |
+
CTAA
|
277 |
+
CTAT
|
278 |
+
CTAC
|
279 |
+
CTAG
|
280 |
+
CTAN
|
281 |
+
CTTA
|
282 |
+
CTTT
|
283 |
+
CTTC
|
284 |
+
CTTG
|
285 |
+
CTTN
|
286 |
+
CTCA
|
287 |
+
CTCT
|
288 |
+
CTCC
|
289 |
+
CTCG
|
290 |
+
CTCN
|
291 |
+
CTGA
|
292 |
+
CTGT
|
293 |
+
CTGC
|
294 |
+
CTGG
|
295 |
+
CTGN
|
296 |
+
CTNA
|
297 |
+
CTNT
|
298 |
+
CTNC
|
299 |
+
CTNG
|
300 |
+
CTNN
|
301 |
+
CCAA
|
302 |
+
CCAT
|
303 |
+
CCAC
|
304 |
+
CCAG
|
305 |
+
CCAN
|
306 |
+
CCTA
|
307 |
+
CCTT
|
308 |
+
CCTC
|
309 |
+
CCTG
|
310 |
+
CCTN
|
311 |
+
CCCA
|
312 |
+
CCCT
|
313 |
+
CCCC
|
314 |
+
CCCG
|
315 |
+
CCCN
|
316 |
+
CCGA
|
317 |
+
CCGT
|
318 |
+
CCGC
|
319 |
+
CCGG
|
320 |
+
CCGN
|
321 |
+
CCNA
|
322 |
+
CCNT
|
323 |
+
CCNC
|
324 |
+
CCNG
|
325 |
+
CCNN
|
326 |
+
CGAA
|
327 |
+
CGAT
|
328 |
+
CGAC
|
329 |
+
CGAG
|
330 |
+
CGAN
|
331 |
+
CGTA
|
332 |
+
CGTT
|
333 |
+
CGTC
|
334 |
+
CGTG
|
335 |
+
CGTN
|
336 |
+
CGCA
|
337 |
+
CGCT
|
338 |
+
CGCC
|
339 |
+
CGCG
|
340 |
+
CGCN
|
341 |
+
CGGA
|
342 |
+
CGGT
|
343 |
+
CGGC
|
344 |
+
CGGG
|
345 |
+
CGGN
|
346 |
+
CGNA
|
347 |
+
CGNT
|
348 |
+
CGNC
|
349 |
+
CGNG
|
350 |
+
CGNN
|
351 |
+
CNAA
|
352 |
+
CNAT
|
353 |
+
CNAC
|
354 |
+
CNAG
|
355 |
+
CNAN
|
356 |
+
CNTA
|
357 |
+
CNTT
|
358 |
+
CNTC
|
359 |
+
CNTG
|
360 |
+
CNTN
|
361 |
+
CNCA
|
362 |
+
CNCT
|
363 |
+
CNCC
|
364 |
+
CNCG
|
365 |
+
CNCN
|
366 |
+
CNGA
|
367 |
+
CNGT
|
368 |
+
CNGC
|
369 |
+
CNGG
|
370 |
+
CNGN
|
371 |
+
CNNA
|
372 |
+
CNNT
|
373 |
+
CNNC
|
374 |
+
CNNG
|
375 |
+
CNNN
|
376 |
+
GAAA
|
377 |
+
GAAT
|
378 |
+
GAAC
|
379 |
+
GAAG
|
380 |
+
GAAN
|
381 |
+
GATA
|
382 |
+
GATT
|
383 |
+
GATC
|
384 |
+
GATG
|
385 |
+
GATN
|
386 |
+
GACA
|
387 |
+
GACT
|
388 |
+
GACC
|
389 |
+
GACG
|
390 |
+
GACN
|
391 |
+
GAGA
|
392 |
+
GAGT
|
393 |
+
GAGC
|
394 |
+
GAGG
|
395 |
+
GAGN
|
396 |
+
GANA
|
397 |
+
GANT
|
398 |
+
GANC
|
399 |
+
GANG
|
400 |
+
GANN
|
401 |
+
GTAA
|
402 |
+
GTAT
|
403 |
+
GTAC
|
404 |
+
GTAG
|
405 |
+
GTAN
|
406 |
+
GTTA
|
407 |
+
GTTT
|
408 |
+
GTTC
|
409 |
+
GTTG
|
410 |
+
GTTN
|
411 |
+
GTCA
|
412 |
+
GTCT
|
413 |
+
GTCC
|
414 |
+
GTCG
|
415 |
+
GTCN
|
416 |
+
GTGA
|
417 |
+
GTGT
|
418 |
+
GTGC
|
419 |
+
GTGG
|
420 |
+
GTGN
|
421 |
+
GTNA
|
422 |
+
GTNT
|
423 |
+
GTNC
|
424 |
+
GTNG
|
425 |
+
GTNN
|
426 |
+
GCAA
|
427 |
+
GCAT
|
428 |
+
GCAC
|
429 |
+
GCAG
|
430 |
+
GCAN
|
431 |
+
GCTA
|
432 |
+
GCTT
|
433 |
+
GCTC
|
434 |
+
GCTG
|
435 |
+
GCTN
|
436 |
+
GCCA
|
437 |
+
GCCT
|
438 |
+
GCCC
|
439 |
+
GCCG
|
440 |
+
GCCN
|
441 |
+
GCGA
|
442 |
+
GCGT
|
443 |
+
GCGC
|
444 |
+
GCGG
|
445 |
+
GCGN
|
446 |
+
GCNA
|
447 |
+
GCNT
|
448 |
+
GCNC
|
449 |
+
GCNG
|
450 |
+
GCNN
|
451 |
+
GGAA
|
452 |
+
GGAT
|
453 |
+
GGAC
|
454 |
+
GGAG
|
455 |
+
GGAN
|
456 |
+
GGTA
|
457 |
+
GGTT
|
458 |
+
GGTC
|
459 |
+
GGTG
|
460 |
+
GGTN
|
461 |
+
GGCA
|
462 |
+
GGCT
|
463 |
+
GGCC
|
464 |
+
GGCG
|
465 |
+
GGCN
|
466 |
+
GGGA
|
467 |
+
GGGT
|
468 |
+
GGGC
|
469 |
+
GGGG
|
470 |
+
GGGN
|
471 |
+
GGNA
|
472 |
+
GGNT
|
473 |
+
GGNC
|
474 |
+
GGNG
|
475 |
+
GGNN
|
476 |
+
GNAA
|
477 |
+
GNAT
|
478 |
+
GNAC
|
479 |
+
GNAG
|
480 |
+
GNAN
|
481 |
+
GNTA
|
482 |
+
GNTT
|
483 |
+
GNTC
|
484 |
+
GNTG
|
485 |
+
GNTN
|
486 |
+
GNCA
|
487 |
+
GNCT
|
488 |
+
GNCC
|
489 |
+
GNCG
|
490 |
+
GNCN
|
491 |
+
GNGA
|
492 |
+
GNGT
|
493 |
+
GNGC
|
494 |
+
GNGG
|
495 |
+
GNGN
|
496 |
+
GNNA
|
497 |
+
GNNT
|
498 |
+
GNNC
|
499 |
+
GNNG
|
500 |
+
GNNN
|
501 |
+
NAAA
|
502 |
+
NAAT
|
503 |
+
NAAC
|
504 |
+
NAAG
|
505 |
+
NAAN
|
506 |
+
NATA
|
507 |
+
NATT
|
508 |
+
NATC
|
509 |
+
NATG
|
510 |
+
NATN
|
511 |
+
NACA
|
512 |
+
NACT
|
513 |
+
NACC
|
514 |
+
NACG
|
515 |
+
NACN
|
516 |
+
NAGA
|
517 |
+
NAGT
|
518 |
+
NAGC
|
519 |
+
NAGG
|
520 |
+
NAGN
|
521 |
+
NANA
|
522 |
+
NANT
|
523 |
+
NANC
|
524 |
+
NANG
|
525 |
+
NANN
|
526 |
+
NTAA
|
527 |
+
NTAT
|
528 |
+
NTAC
|
529 |
+
NTAG
|
530 |
+
NTAN
|
531 |
+
NTTA
|
532 |
+
NTTT
|
533 |
+
NTTC
|
534 |
+
NTTG
|
535 |
+
NTTN
|
536 |
+
NTCA
|
537 |
+
NTCT
|
538 |
+
NTCC
|
539 |
+
NTCG
|
540 |
+
NTCN
|
541 |
+
NTGA
|
542 |
+
NTGT
|
543 |
+
NTGC
|
544 |
+
NTGG
|
545 |
+
NTGN
|
546 |
+
NTNA
|
547 |
+
NTNT
|
548 |
+
NTNC
|
549 |
+
NTNG
|
550 |
+
NTNN
|
551 |
+
NCAA
|
552 |
+
NCAT
|
553 |
+
NCAC
|
554 |
+
NCAG
|
555 |
+
NCAN
|
556 |
+
NCTA
|
557 |
+
NCTT
|
558 |
+
NCTC
|
559 |
+
NCTG
|
560 |
+
NCTN
|
561 |
+
NCCA
|
562 |
+
NCCT
|
563 |
+
NCCC
|
564 |
+
NCCG
|
565 |
+
NCCN
|
566 |
+
NCGA
|
567 |
+
NCGT
|
568 |
+
NCGC
|
569 |
+
NCGG
|
570 |
+
NCGN
|
571 |
+
NCNA
|
572 |
+
NCNT
|
573 |
+
NCNC
|
574 |
+
NCNG
|
575 |
+
NCNN
|
576 |
+
NGAA
|
577 |
+
NGAT
|
578 |
+
NGAC
|
579 |
+
NGAG
|
580 |
+
NGAN
|
581 |
+
NGTA
|
582 |
+
NGTT
|
583 |
+
NGTC
|
584 |
+
NGTG
|
585 |
+
NGTN
|
586 |
+
NGCA
|
587 |
+
NGCT
|
588 |
+
NGCC
|
589 |
+
NGCG
|
590 |
+
NGCN
|
591 |
+
NGGA
|
592 |
+
NGGT
|
593 |
+
NGGC
|
594 |
+
NGGG
|
595 |
+
NGGN
|
596 |
+
NGNA
|
597 |
+
NGNT
|
598 |
+
NGNC
|
599 |
+
NGNG
|
600 |
+
NGNN
|
601 |
+
NNAA
|
602 |
+
NNAT
|
603 |
+
NNAC
|
604 |
+
NNAG
|
605 |
+
NNAN
|
606 |
+
NNTA
|
607 |
+
NNTT
|
608 |
+
NNTC
|
609 |
+
NNTG
|
610 |
+
NNTN
|
611 |
+
NNCA
|
612 |
+
NNCT
|
613 |
+
NNCC
|
614 |
+
NNCG
|
615 |
+
NNCN
|
616 |
+
NNGA
|
617 |
+
NNGT
|
618 |
+
NNGC
|
619 |
+
NNGG
|
620 |
+
NNGN
|
621 |
+
NNNA
|
622 |
+
NNNT
|
623 |
+
NNNC
|
624 |
+
NNNG
|
625 |
+
NNNN
|
626 |
+
[MASK]
|
627 |
+
[CLS]
|
628 |
+
[PAD]
|
629 |
+
[SEP]
|
630 |
+
[UNK]
|