Lakoc committed on
Commit
6c93321
·
verified ·
1 Parent(s): cb13848

Upload tokenizer

Browse files
Files changed (3) hide show
  1. special_tokens_map.json +5 -5
  2. tokenizer.json +35 -32
  3. tokenizer_config.json +23 -15
special_tokens_map.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
- "bos_token": "(BOS)",
3
- "eos_token": "(EOS)",
4
- "mask_token": "(MASK)",
5
- "pad_token": "(PAD)",
6
- "unk_token": "(UNK)"
7
  }
 
1
  {
2
+ "bos_token": "([bos])",
3
+ "eos_token": "([eos])",
4
+ "mask_token": "([mask])",
5
+ "pad_token": "([pad])",
6
+ "unk_token": "([unk])"
7
  }
tokenizer.json CHANGED
@@ -5,7 +5,7 @@
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
- "content": "(BOS)",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
@@ -14,7 +14,7 @@
14
  },
15
  {
16
  "id": 1,
17
- "content": "(EOS)",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
@@ -23,16 +23,16 @@
23
  },
24
  {
25
  "id": 2,
26
- "content": "(UNK)",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
- "normalized": true,
31
- "special": false
32
  },
33
  {
34
  "id": 3,
35
- "content": "(PAD)",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
@@ -41,7 +41,7 @@
41
  },
42
  {
43
  "id": 4,
44
- "content": "(MASK)",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
@@ -49,7 +49,7 @@
49
  "special": true
50
  },
51
  {
52
- "id": 1000,
53
  "content": "(LNG)",
54
  "single_word": false,
55
  "lstrip": false,
@@ -58,7 +58,16 @@
58
  "special": false
59
  },
60
  {
61
- "id": 1001,
 
 
 
 
 
 
 
 
 
62
  "content": "(SPN)",
63
  "single_word": false,
64
  "lstrip": false,
@@ -85,7 +94,7 @@
85
  },
86
  {
87
  "SpecialToken": {
88
- "id": "(EOS)",
89
  "type_id": 0
90
  }
91
  }
@@ -99,7 +108,7 @@
99
  },
100
  {
101
  "SpecialToken": {
102
- "id": "(EOS)",
103
  "type_id": 0
104
  }
105
  },
@@ -111,28 +120,28 @@
111
  },
112
  {
113
  "SpecialToken": {
114
- "id": "(EOS)",
115
  "type_id": 1
116
  }
117
  }
118
  ],
119
  "special_tokens": {
120
- "(BOS)": {
121
- "id": "(BOS)",
122
  "ids": [
123
  0
124
  ],
125
  "tokens": [
126
- "(BOS)"
127
  ]
128
  },
129
- "(EOS)": {
130
- "id": "(EOS)",
131
  "ids": [
132
  1
133
  ],
134
  "tokens": [
135
- "(EOS)"
136
  ]
137
  }
138
  }
@@ -146,17 +155,17 @@
146
  "model": {
147
  "type": "BPE",
148
  "dropout": null,
149
- "unk_token": "(UNK)",
150
  "continuing_subword_prefix": null,
151
  "end_of_word_suffix": null,
152
  "fuse_unk": false,
153
  "byte_fallback": false,
154
  "vocab": {
155
- "(BOS)": 0,
156
- "(EOS)": 1,
157
- "(UNK)": 2,
158
- "(PAD)": 3,
159
- "(MASK)": 4,
160
  "!": 5,
161
  "%": 6,
162
  "'": 7,
@@ -1148,10 +1157,7 @@
1148
  "Ġznamen": 993,
1149
  "vý": 994,
1150
  "ovala": 995,
1151
- "rych": 996,
1152
- "cká": 997,
1153
- "ĠJ": 998,
1154
- "Ġdisku": 999
1155
  },
1156
  "merges": [
1157
  "Ã Ń",
@@ -2011,10 +2017,7 @@
2011
  "Ġzna men",
2012
  "v ý",
2013
  "ova la",
2014
- "ry ch",
2015
- "ck á",
2016
- "Ġ J",
2017
- "Ġdi sku"
2018
  ]
2019
  }
2020
  }
 
5
  "added_tokens": [
6
  {
7
  "id": 0,
8
+ "content": "([bos])",
9
  "single_word": false,
10
  "lstrip": false,
11
  "rstrip": false,
 
14
  },
15
  {
16
  "id": 1,
17
+ "content": "([eos])",
18
  "single_word": false,
19
  "lstrip": false,
20
  "rstrip": false,
 
23
  },
24
  {
25
  "id": 2,
26
+ "content": "([unk])",
27
  "single_word": false,
28
  "lstrip": false,
29
  "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
  },
33
  {
34
  "id": 3,
35
+ "content": "([pad])",
36
  "single_word": false,
37
  "lstrip": false,
38
  "rstrip": false,
 
41
  },
42
  {
43
  "id": 4,
44
+ "content": "([mask])",
45
  "single_word": false,
46
  "lstrip": false,
47
  "rstrip": false,
 
49
  "special": true
50
  },
51
  {
52
+ "id": 997,
53
  "content": "(LNG)",
54
  "single_word": false,
55
  "lstrip": false,
 
58
  "special": false
59
  },
60
  {
61
+ "id": 998,
62
+ "content": "(UNK)",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": true,
67
+ "special": false
68
+ },
69
+ {
70
+ "id": 999,
71
  "content": "(SPN)",
72
  "single_word": false,
73
  "lstrip": false,
 
94
  },
95
  {
96
  "SpecialToken": {
97
+ "id": "([eos])",
98
  "type_id": 0
99
  }
100
  }
 
108
  },
109
  {
110
  "SpecialToken": {
111
+ "id": "([eos])",
112
  "type_id": 0
113
  }
114
  },
 
120
  },
121
  {
122
  "SpecialToken": {
123
+ "id": "([eos])",
124
  "type_id": 1
125
  }
126
  }
127
  ],
128
  "special_tokens": {
129
+ "([bos])": {
130
+ "id": "([bos])",
131
  "ids": [
132
  0
133
  ],
134
  "tokens": [
135
+ "([bos])"
136
  ]
137
  },
138
+ "([eos])": {
139
+ "id": "([eos])",
140
  "ids": [
141
  1
142
  ],
143
  "tokens": [
144
+ "([eos])"
145
  ]
146
  }
147
  }
 
155
  "model": {
156
  "type": "BPE",
157
  "dropout": null,
158
+ "unk_token": "([unk])",
159
  "continuing_subword_prefix": null,
160
  "end_of_word_suffix": null,
161
  "fuse_unk": false,
162
  "byte_fallback": false,
163
  "vocab": {
164
+ "([bos])": 0,
165
+ "([eos])": 1,
166
+ "([unk])": 2,
167
+ "([pad])": 3,
168
+ "([mask])": 4,
169
  "!": 5,
170
  "%": 6,
171
  "'": 7,
 
1157
  "Ġznamen": 993,
1158
  "vý": 994,
1159
  "ovala": 995,
1160
+ "rych": 996
 
 
 
1161
  },
1162
  "merges": [
1163
  "Ã Ń",
 
2017
  "Ġzna men",
2018
  "v ý",
2019
  "ova la",
2020
+ "ry ch"
 
 
 
2021
  ]
2022
  }
2023
  }
tokenizer_config.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
- "content": "(BOS)",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
@@ -9,7 +9,7 @@
9
  "special": true
10
  },
11
  "1": {
12
- "content": "(EOS)",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
@@ -17,15 +17,15 @@
17
  "special": true
18
  },
19
  "2": {
20
- "content": "(UNK)",
21
  "lstrip": false,
22
- "normalized": true,
23
  "rstrip": false,
24
  "single_word": false,
25
- "special": false
26
  },
27
  "3": {
28
- "content": "(PAD)",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
@@ -33,14 +33,14 @@
33
  "special": true
34
  },
35
  "4": {
36
- "content": "(MASK)",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  },
43
- "1000": {
44
  "content": "(LNG)",
45
  "lstrip": false,
46
  "normalized": true,
@@ -48,7 +48,15 @@
48
  "single_word": false,
49
  "special": false
50
  },
51
- "1001": {
 
 
 
 
 
 
 
 
52
  "content": "(SPN)",
53
  "lstrip": false,
54
  "normalized": true,
@@ -57,12 +65,12 @@
57
  "special": false
58
  }
59
  },
60
- "bos_token": "(BOS)",
61
- "clean_up_tokenization_spaces": false,
62
- "eos_token": "(EOS)",
63
- "mask_token": "(MASK)",
64
  "model_max_length": 1000000000000000019884624838656,
65
- "pad_token": "(PAD)",
66
  "tokenizer_class": "PreTrainedTokenizerFast",
67
- "unk_token": "(UNK)"
68
  }
 
1
  {
2
  "added_tokens_decoder": {
3
  "0": {
4
+ "content": "([bos])",
5
  "lstrip": false,
6
  "normalized": false,
7
  "rstrip": false,
 
9
  "special": true
10
  },
11
  "1": {
12
+ "content": "([eos])",
13
  "lstrip": false,
14
  "normalized": false,
15
  "rstrip": false,
 
17
  "special": true
18
  },
19
  "2": {
20
+ "content": "([unk])",
21
  "lstrip": false,
22
+ "normalized": false,
23
  "rstrip": false,
24
  "single_word": false,
25
+ "special": true
26
  },
27
  "3": {
28
+ "content": "([pad])",
29
  "lstrip": false,
30
  "normalized": false,
31
  "rstrip": false,
 
33
  "special": true
34
  },
35
  "4": {
36
+ "content": "([mask])",
37
  "lstrip": false,
38
  "normalized": false,
39
  "rstrip": false,
40
  "single_word": false,
41
  "special": true
42
  },
43
+ "997": {
44
  "content": "(LNG)",
45
  "lstrip": false,
46
  "normalized": true,
 
48
  "single_word": false,
49
  "special": false
50
  },
51
+ "998": {
52
+ "content": "(UNK)",
53
+ "lstrip": false,
54
+ "normalized": true,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": false
58
+ },
59
+ "999": {
60
  "content": "(SPN)",
61
  "lstrip": false,
62
  "normalized": true,
 
65
  "special": false
66
  }
67
  },
68
+ "bos_token": "([bos])",
69
+ "clean_up_tokenization_spaces": true,
70
+ "eos_token": "([eos])",
71
+ "mask_token": "([mask])",
72
  "model_max_length": 1000000000000000019884624838656,
73
+ "pad_token": "([pad])",
74
  "tokenizer_class": "PreTrainedTokenizerFast",
75
+ "unk_token": "([unk])"
76
  }