queenVdu13 committed
Commit 453a88b · verified · 1 Parent(s): 820dac0

Upload tokenizer

Files changed (5)
  1. merges.txt +0 -0
  2. special_tokens_map.json +6 -10
  3. tokenizer.json +0 -0
  4. tokenizer_config.json +11 -37
  5. vocab.json +0 -0
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json CHANGED
@@ -1,26 +1,22 @@
 {
-  "additional_special_tokens": [
-    "<s>NOTUSED",
-    "</s>NOTUSED"
-  ],
   "bos_token": {
     "content": "<s>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
   "cls_token": {
     "content": "<s>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
   "eos_token": {
     "content": "</s>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
@@ -34,21 +30,21 @@
   "pad_token": {
     "content": "<pad>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
   "sep_token": {
     "content": "</s>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   },
   "unk_token": {
     "content": "<unk>",
     "lstrip": false,
-    "normalized": false,
+    "normalized": true,
     "rstrip": false,
     "single_word": false
   }
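The net effect for this file is that the CamemBERT-style "NOTUSED" placeholders are dropped and the remaining special tokens are marked as normalized. A minimal sketch of how to confirm this after the change, assuming a local checkout of the repository (the path below is a placeholder, not part of this commit):

from transformers import AutoTokenizer

# Placeholder path: point this at a local clone of the repository.
tok = AutoTokenizer.from_pretrained("./path-to-this-repo")

# The NOTUSED entries should be gone; only the standard special tokens remain.
print(tok.special_tokens_map)
# expected keys: bos/eos/unk/sep/pad/cls/mask, with no "<s>NOTUSED" / "</s>NOTUSED"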
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1,9 +1,10 @@
 {
+  "add_prefix_space": false,
   "added_tokens_decoder": {
     "0": {
-      "content": "<s>NOTUSED",
+      "content": "<s>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
@@ -11,15 +12,15 @@
     "1": {
       "content": "<pad>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
     "2": {
-      "content": "</s>NOTUSED",
+      "content": "</s>",
       "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
@@ -27,36 +28,12 @@
     "3": {
       "content": "<unk>",
       "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "4": {
-      "content": "<unk>NOTUSED",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "5": {
-      "content": "<s>",
-      "lstrip": false,
-      "normalized": false,
-      "rstrip": false,
-      "single_word": false,
-      "special": true
-    },
-    "6": {
-      "content": "</s>",
-      "lstrip": false,
-      "normalized": false,
+      "normalized": true,
       "rstrip": false,
       "single_word": false,
       "special": true
     },
-    "32004": {
+    "50264": {
       "content": "<mask>",
       "lstrip": true,
       "normalized": false,
@@ -65,14 +42,11 @@
       "special": true
     }
   },
-  "additional_special_tokens": [
-    "<s>NOTUSED",
-    "</s>NOTUSED"
-  ],
   "bos_token": "<s>",
   "clean_up_tokenization_spaces": false,
   "cls_token": "<s>",
   "eos_token": "</s>",
+  "errors": "replace",
   "mask_token": "<mask>",
   "max_length": 128,
   "model_max_length": 512,
@@ -81,9 +55,9 @@
   "pad_token_type_id": 0,
   "padding_side": "right",
   "sep_token": "</s>",
-  "sp_model_kwargs": {},
   "stride": 0,
-  "tokenizer_class": "CamembertTokenizer",
+  "tokenizer_class": "RobertaTokenizer",
+  "trim_offsets": true,
   "truncation_side": "right",
   "truncation_strategy": "longest_first",
   "unk_token": "<unk>"
vocab.json ADDED
The diff for this file is too large to render. See raw diff