Muennighoff committed
Commit 9867b99
1 Parent(s): 3a42dfc

Fix tokenizer
added_tokens.json CHANGED
@@ -1,10 +1,7 @@
 {
-  "<im_col>": 151649,
-  "<im_end>": 151647,
-  "<im_patch>": 151648,
-  "<im_start>": 151646,
-  "<|endoftext|>": 151643,
-  "<|im_end|>": 151645,
-  "<|im_start|>": 151644,
-  "<|image|>": 151650
+  "<im_col>": 100281,
+  "<im_end>": 100279,
+  "<im_patch>": 100280,
+  "<im_start>": 100278,
+  "<|image|>": 100282
 }
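The remapped ids can be sanity-checked by loading the tokenizer and round-tripping each entry of added_tokens.json. A minimal sketch, assuming a local checkout of this repo in the current directory (the path is an assumption, not part of the commit):

```python
# Sanity check: every token in added_tokens.json should round-trip to its id.
# Assumes a local checkout of this repo in the current directory.
import json

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

with open("added_tokens.json") as f:
    added_tokens = json.load(f)

for token, token_id in added_tokens.items():
    assert tok.convert_tokens_to_ids(token) == token_id, token
```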
special_tokens_map.json CHANGED
@@ -1,4 +1,37 @@
 {
-  "eos_token": "<|endoftext|>",
-  "pad_token": "<|padding|>"
+  "additional_special_tokens": [
+    "<im_start>",
+    "<im_end>",
+    "<im_patch>",
+    "<im_col>",
+    "<|image|>"
+  ],
+  "bos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|pad|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
 }
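Once loaded, the expanded map surfaces as tokenizer attributes. A sketch of how to inspect them, again assuming a local checkout; note that tokenizer_config.json can take precedence over special_tokens_map.json at load time, so printed values should be checked against both files:

```python
# Inspect how the expanded special-token map surfaces after loading.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

print(tok.bos_token, tok.eos_token, tok.pad_token, tok.unk_token)
print(tok.additional_special_tokens)
# Expected from this commit's special_tokens_map.json:
# ['<im_start>', '<im_end>', '<im_patch>', '<im_col>', '<|image|>']
```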
tokenizer.json CHANGED
@@ -254,6 +254,51 @@
       "rstrip": false,
       "normalized": false,
       "special": true
+    },
+    {
+      "id": 50280,
+      "content": "<im_start>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50281,
+      "content": "<im_end>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50282,
+      "content": "<im_patch>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50283,
+      "content": "<im_col>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
+    },
+    {
+      "id": 50284,
+      "content": "<|image|>",
+      "single_word": false,
+      "lstrip": false,
+      "rstrip": false,
+      "normalized": false,
+      "special": true
     }
   ],
   "normalizer": {
tokenizer_config.json CHANGED
@@ -226,7 +226,47 @@
       "rstrip": false,
       "single_word": false,
       "special": true
-    }
+    },
+    "50280": {
+      "content": "<im_start>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50281": {
+      "content": "<im_end>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50282": {
+      "content": "<im_patch>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50283": {
+      "content": "<im_col>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "50284": {
+      "content": "<|image|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
   },
   "bos_token": null,
   "clean_up_tokenization_spaces": true,
@@ -235,4 +275,4 @@
   "pad_token": "<|padding|>",
   "tokenizer_class": "GPTNeoXTokenizer",
   "unk_token": null
-}
+}
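With the new entries registered in the config, each image token should encode as a single id and map back to itself. A sketch, assuming a local checkout of this repo:

```python
# Each newly registered token should encode to exactly one id (no subword
# splitting) and convert back to the same string.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(".")

for token in ["<im_start>", "<im_end>", "<im_patch>", "<im_col>", "<|image|>"]:
    ids = tok.encode(token, add_special_tokens=False)
    assert len(ids) == 1, token
    assert tok.convert_ids_to_tokens(ids[0]) == token
```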
vocab.json DELETED
The diff for this file is too large to render. See raw diff