mlinmg committed on
Commit
9cb73ec
·
verified ·
1 Parent(s): fea9a16

Upload tokenizer

Browse files
Files changed (3) hide show
  1. added_tokens.json +10 -1
  2. tokenizer.json +82 -1
  3. tokenizer_config.json +73 -1
added_tokens.json CHANGED
@@ -3,5 +3,14 @@
3
  "<BOLG>": 50267,
4
  "<EOBG>": 50266,
5
  "<EOLG>": 50268,
6
- "<STOPG>": 50269
 
 
 
 
 
 
 
 
 
7
  }
 
3
  "<BOLG>": 50267,
4
  "<EOBG>": 50266,
5
  "<EOLG>": 50268,
6
+ "<de>": 50278,
7
+ "<en>": 50272,
8
+ "<es>": 50277,
9
+ "<fr>": 50276,
10
+ "<it>": 50271,
11
+ "<ja>": 50275,
12
+ "<ko>": 50274,
13
+ "<music>": 50269,
14
+ "<voice>": 50270,
15
+ "<zh>": 50273
16
  }
tokenizer.json CHANGED
@@ -93,7 +93,88 @@
93
  },
94
  {
95
  "id": 50269,
96
- "content": "<STOPG>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
  "single_word": false,
98
  "lstrip": false,
99
  "rstrip": false,
 
93
  },
94
  {
95
  "id": 50269,
96
+ "content": "<music>",
97
+ "single_word": false,
98
+ "lstrip": false,
99
+ "rstrip": false,
100
+ "normalized": false,
101
+ "special": true
102
+ },
103
+ {
104
+ "id": 50270,
105
+ "content": "<voice>",
106
+ "single_word": false,
107
+ "lstrip": false,
108
+ "rstrip": false,
109
+ "normalized": false,
110
+ "special": true
111
+ },
112
+ {
113
+ "id": 50271,
114
+ "content": "<it>",
115
+ "single_word": false,
116
+ "lstrip": false,
117
+ "rstrip": false,
118
+ "normalized": false,
119
+ "special": true
120
+ },
121
+ {
122
+ "id": 50272,
123
+ "content": "<en>",
124
+ "single_word": false,
125
+ "lstrip": false,
126
+ "rstrip": false,
127
+ "normalized": false,
128
+ "special": true
129
+ },
130
+ {
131
+ "id": 50273,
132
+ "content": "<zh>",
133
+ "single_word": false,
134
+ "lstrip": false,
135
+ "rstrip": false,
136
+ "normalized": false,
137
+ "special": true
138
+ },
139
+ {
140
+ "id": 50274,
141
+ "content": "<ko>",
142
+ "single_word": false,
143
+ "lstrip": false,
144
+ "rstrip": false,
145
+ "normalized": false,
146
+ "special": true
147
+ },
148
+ {
149
+ "id": 50275,
150
+ "content": "<ja>",
151
+ "single_word": false,
152
+ "lstrip": false,
153
+ "rstrip": false,
154
+ "normalized": false,
155
+ "special": true
156
+ },
157
+ {
158
+ "id": 50276,
159
+ "content": "<fr>",
160
+ "single_word": false,
161
+ "lstrip": false,
162
+ "rstrip": false,
163
+ "normalized": false,
164
+ "special": true
165
+ },
166
+ {
167
+ "id": 50277,
168
+ "content": "<es>",
169
+ "single_word": false,
170
+ "lstrip": false,
171
+ "rstrip": false,
172
+ "normalized": false,
173
+ "special": true
174
+ },
175
+ {
176
+ "id": 50278,
177
+ "content": "<de>",
178
  "single_word": false,
179
  "lstrip": false,
180
  "rstrip": false,
tokenizer_config.json CHANGED
@@ -74,7 +74,79 @@
74
  "special": true
75
  },
76
  "50269": {
77
- "content": "<STOPG>",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  "lstrip": false,
79
  "normalized": false,
80
  "rstrip": false,
 
74
  "special": true
75
  },
76
  "50269": {
77
+ "content": "<music>",
78
+ "lstrip": false,
79
+ "normalized": false,
80
+ "rstrip": false,
81
+ "single_word": false,
82
+ "special": true
83
+ },
84
+ "50270": {
85
+ "content": "<voice>",
86
+ "lstrip": false,
87
+ "normalized": false,
88
+ "rstrip": false,
89
+ "single_word": false,
90
+ "special": true
91
+ },
92
+ "50271": {
93
+ "content": "<it>",
94
+ "lstrip": false,
95
+ "normalized": false,
96
+ "rstrip": false,
97
+ "single_word": false,
98
+ "special": true
99
+ },
100
+ "50272": {
101
+ "content": "<en>",
102
+ "lstrip": false,
103
+ "normalized": false,
104
+ "rstrip": false,
105
+ "single_word": false,
106
+ "special": true
107
+ },
108
+ "50273": {
109
+ "content": "<zh>",
110
+ "lstrip": false,
111
+ "normalized": false,
112
+ "rstrip": false,
113
+ "single_word": false,
114
+ "special": true
115
+ },
116
+ "50274": {
117
+ "content": "<ko>",
118
+ "lstrip": false,
119
+ "normalized": false,
120
+ "rstrip": false,
121
+ "single_word": false,
122
+ "special": true
123
+ },
124
+ "50275": {
125
+ "content": "<ja>",
126
+ "lstrip": false,
127
+ "normalized": false,
128
+ "rstrip": false,
129
+ "single_word": false,
130
+ "special": true
131
+ },
132
+ "50276": {
133
+ "content": "<fr>",
134
+ "lstrip": false,
135
+ "normalized": false,
136
+ "rstrip": false,
137
+ "single_word": false,
138
+ "special": true
139
+ },
140
+ "50277": {
141
+ "content": "<es>",
142
+ "lstrip": false,
143
+ "normalized": false,
144
+ "rstrip": false,
145
+ "single_word": false,
146
+ "special": true
147
+ },
148
+ "50278": {
149
+ "content": "<de>",
150
  "lstrip": false,
151
  "normalized": false,
152
  "rstrip": false,