bishaltwr committed on
Commit
6cd6645
·
verified ·
1 Parent(s): f831d4a

Upload tokenizer

Browse files
Files changed (4) hide show
  1. README.md +3 -3
  2. special_tokens_map.json +16 -4
  3. tokenizer_config.json +2 -2
  4. vocab.json +253 -252
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- license: cc-by-nc-4.0
3
  base_model: facebook/mms-1b-all
4
- tags:
5
- - generated_from_trainer
6
  metrics:
7
  - wer
 
 
8
  model-index:
9
  - name: wav2vec2-large-mms-1b-nepali
10
  results: []
 
1
  ---
 
2
  base_model: facebook/mms-1b-all
3
+ license: cc-by-nc-4.0
 
4
  metrics:
5
  - wer
6
+ tags:
7
+ - generated_from_trainer
8
  model-index:
9
  - name: wav2vec2-large-mms-1b-nepali
10
  results: []
special_tokens_map.json CHANGED
@@ -1,6 +1,18 @@
1
  {
2
- "bos_token": "<s>",
3
- "eos_token": "</s>",
4
- "pad_token": "[PAD]",
5
- "unk_token": "[UNK]"
 
 
 
 
 
 
 
 
 
 
 
 
6
  }
 
1
  {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "[pad]",
17
+ "unk_token": "[unk]"
18
  }
tokenizer_config.json CHANGED
@@ -38,10 +38,10 @@
38
  "do_lower_case": false,
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
- "pad_token": "[PAD]",
42
  "replace_word_delimiter_char": " ",
43
  "target_lang": "npi",
44
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
- "unk_token": "[UNK]",
46
  "word_delimiter_token": "|"
47
  }
 
38
  "do_lower_case": false,
39
  "eos_token": "</s>",
40
  "model_max_length": 1000000000000000019884624838656,
41
+ "pad_token": "[pad]",
42
  "replace_word_delimiter_char": " ",
43
  "target_lang": "npi",
44
  "tokenizer_class": "Wav2Vec2CTCTokenizer",
45
+ "unk_token": "[unk]",
46
  "word_delimiter_token": "|"
47
  }
vocab.json CHANGED
@@ -1,252 +1,253 @@
1
- {"eng": {
2
- "!": 75,
3
- "\"": 62,
4
- "$": 48,
5
- "%": 47,
6
- "&": 57,
7
- "'": 31,
8
- "+": 56,
9
- ",": 44,
10
- "-": 32,
11
- ".": 37,
12
- "/": 45,
13
- "0": 27,
14
- "1": 30,
15
- "2": 35,
16
- "3": 41,
17
- "4": 39,
18
- "5": 38,
19
- "6": 40,
20
- "7": 43,
21
- "8": 42,
22
- "9": 36,
23
- ":": 46,
24
- ";": 52,
25
- "</s>": 2,
26
- "<pad>": 0,
27
- "<s>": 1,
28
- "<unk>": 3,
29
- "[": 59,
30
- "]": 60,
31
- "`": 92,
32
- "a": 7,
33
- "b": 24,
34
- "c": 16,
35
- "d": 15,
36
- "e": 5,
37
- "f": 19,
38
- "g": 21,
39
- "h": 13,
40
- "i": 9,
41
- "j": 29,
42
- "k": 26,
43
- "l": 14,
44
- "m": 18,
45
- "n": 10,
46
- "o": 8,
47
- "p": 20,
48
- "q": 34,
49
- "r": 12,
50
- "s": 11,
51
- "t": 6,
52
- "u": 17,
53
- "v": 25,
54
- "w": 23,
55
- "x": 28,
56
- "y": 22,
57
- "z": 33,
58
- "|": 4,
59
- "\u00a3": 68,
60
- "\u00a5": 53,
61
- "\u00a7": 139,
62
- "\u00b0": 58,
63
- "\u00b2": 76,
64
- "\u00df": 82,
65
- "\u00e0": 84,
66
- "\u00e1": 49,
67
- "\u00e2": 81,
68
- "\u00e3": 66,
69
- "\u00e4": 78,
70
- "\u00e5": 123,
71
- "\u00e6": 110,
72
- "\u00e7": 64,
73
- "\u00e8": 79,
74
- "\u00e9": 55,
75
- "\u00ea": 88,
76
- "\u00eb": 87,
77
- "\u00ed": 50,
78
- "\u00ee": 97,
79
- "\u00ef": 85,
80
- "\u00f0": 98,
81
- "\u00f1": 83,
82
- "\u00f2": 105,
83
- "\u00f3": 67,
84
- "\u00f4": 86,
85
- "\u00f5": 69,
86
- "\u00f6": 74,
87
- "\u00f8": 91,
88
- "\u00fa": 70,
89
- "\u00fb": 99,
90
- "\u00fc": 54,
91
- "\u00fd": 112,
92
- "\u00fe": 132,
93
- "\u0101": 100,
94
- "\u0103": 102,
95
- "\u0107": 94,
96
- "\u010d": 89,
97
- "\u0113": 143,
98
- "\u0119": 119,
99
- "\u011b": 140,
100
- "\u011f": 153,
101
- "\u012b": 107,
102
- "\u0131": 103,
103
- "\u0142": 73,
104
- "\u0144": 118,
105
- "\u0148": 137,
106
- "\u014d": 80,
107
- "\u0151": 120,
108
- "\u0153": 96,
109
- "\u0159": 108,
110
- "\u015f": 116,
111
- "\u0161": 90,
112
- "\u016b": 101,
113
- "\u017e": 95,
114
- "\u0219": 104,
115
- "\u0259": 138,
116
- "\u02bb": 115,
117
- "\u0301": 93,
118
- "\u0307": 77,
119
- "\u03b1": 106,
120
- "\u03ba": 109,
121
- "\u03c0": 117,
122
- "\u03c7": 141,
123
- "\u0430": 144,
124
- "\u0432": 124,
125
- "\u0435": 125,
126
- "\u0437": 130,
127
- "\u0438": 113,
128
- "\u0439": 127,
129
- "\u043a": 114,
130
- "\u043b": 126,
131
- "\u043d": 128,
132
- "\u044c": 131,
133
- "\u044f": 129,
134
- "\u05e0": 147,
135
- "\u05e2": 148,
136
- "\u1e43": 121,
137
- "\u1ea1": 111,
138
- "\u1ea3": 145,
139
- "\u1ecb": 146,
140
- "\u1ee5": 122,
141
- "\u200b": 71,
142
- "\u2013": 61,
143
- "\u2014": 51,
144
- "\u2018": 72,
145
- "\u2019": 65,
146
- "\u201d": 63,
147
- "\u2261": 142,
148
- "\u4eac": 149,
149
- "\u5148": 135,
150
- "\u5927": 151,
151
- "\u5c1a": 134,
152
- "\u65f6": 133,
153
- "\u751f": 136,
154
- "\u90fd": 150,
155
- "\u962a": 152
156
- },
157
- "npi": {
158
- "(": 1,
159
- ")": 2,
160
- "/": 3,
161
- "[PAD]": 92,
162
- "[UNK]": 91,
163
- "a": 4,
164
- "b": 5,
165
- "c": 6,
166
- "e": 7,
167
- "f": 8,
168
- "k": 9,
169
- "o": 10,
170
- "|": 0,
171
- "": 11,
172
- "": 12,
173
- "": 13,
174
- "": 14,
175
- "": 15,
176
- "": 16,
177
- "": 17,
178
- "": 18,
179
- "": 19,
180
- "": 20,
181
- "": 21,
182
- "": 22,
183
- "": 23,
184
- "": 24,
185
- "": 25,
186
- "": 26,
187
- "": 27,
188
- "": 28,
189
- "": 29,
190
- "": 30,
191
- "": 31,
192
- "": 32,
193
- "": 33,
194
- "": 34,
195
- "": 35,
196
- "": 36,
197
- "": 37,
198
- "": 38,
199
- "": 39,
200
- "": 40,
201
- "": 41,
202
- "": 42,
203
- "": 43,
204
- "": 44,
205
- "": 45,
206
- "": 46,
207
- "": 47,
208
- "": 48,
209
- "": 49,
210
- "": 50,
211
- "": 51,
212
- "": 52,
213
- "": 53,
214
- "": 54,
215
- "": 55,
216
- "": 56,
217
- "": 57,
218
- "": 58,
219
- "": 59,
220
- "": 60,
221
- "ि": 61,
222
- "": 62,
223
- "": 63,
224
- "": 64,
225
- "": 65,
226
- "": 66,
227
- "": 67,
228
- "": 68,
229
- "": 69,
230
- "": 70,
231
- "": 71,
232
- "": 72,
233
- "": 73,
234
- "": 74,
235
- "": 75,
236
- "": 76,
237
- "": 77,
238
- "": 78,
239
- "": 79,
240
- "": 80,
241
- "": 81,
242
- "": 82,
243
- "": 83,
244
- "": 84,
245
- "": 85,
246
- "": 86,
247
- "": 87,
248
- "": 88,
249
- "": 89,
250
- "": 90
251
- }
252
- }
 
 
1
+ {
2
+ "eng": {
3
+ "!": 75,
4
+ "\"": 62,
5
+ "$": 48,
6
+ "%": 47,
7
+ "&": 57,
8
+ "'": 31,
9
+ "+": 56,
10
+ ",": 44,
11
+ "-": 32,
12
+ ".": 37,
13
+ "/": 45,
14
+ "0": 27,
15
+ "1": 30,
16
+ "2": 35,
17
+ "3": 41,
18
+ "4": 39,
19
+ "5": 38,
20
+ "6": 40,
21
+ "7": 43,
22
+ "8": 42,
23
+ "9": 36,
24
+ ":": 46,
25
+ ";": 52,
26
+ "</s>": 2,
27
+ "<pad>": 0,
28
+ "<s>": 1,
29
+ "<unk>": 3,
30
+ "[": 59,
31
+ "]": 60,
32
+ "`": 92,
33
+ "a": 7,
34
+ "b": 24,
35
+ "c": 16,
36
+ "d": 15,
37
+ "e": 5,
38
+ "f": 19,
39
+ "g": 21,
40
+ "h": 13,
41
+ "i": 9,
42
+ "j": 29,
43
+ "k": 26,
44
+ "l": 14,
45
+ "m": 18,
46
+ "n": 10,
47
+ "o": 8,
48
+ "p": 20,
49
+ "q": 34,
50
+ "r": 12,
51
+ "s": 11,
52
+ "t": 6,
53
+ "u": 17,
54
+ "v": 25,
55
+ "w": 23,
56
+ "x": 28,
57
+ "y": 22,
58
+ "z": 33,
59
+ "|": 4,
60
+ "£": 68,
61
+ "¥": 53,
62
+ "§": 139,
63
+ "°": 58,
64
+ "²": 76,
65
+ "ß": 82,
66
+ "à": 84,
67
+ "á": 49,
68
+ "â": 81,
69
+ "ã": 66,
70
+ "ä": 78,
71
+ "å": 123,
72
+ "æ": 110,
73
+ "ç": 64,
74
+ "è": 79,
75
+ "é": 55,
76
+ "ê": 88,
77
+ "ë": 87,
78
+ "í": 50,
79
+ "î": 97,
80
+ "ï": 85,
81
+ "ð": 98,
82
+ "ñ": 83,
83
+ "ò": 105,
84
+ "ó": 67,
85
+ "ô": 86,
86
+ "õ": 69,
87
+ "ö": 74,
88
+ "ø": 91,
89
+ "ú": 70,
90
+ "û": 99,
91
+ "ü": 54,
92
+ "ý": 112,
93
+ "þ": 132,
94
+ "ā": 100,
95
+ "ă": 102,
96
+ "ć": 94,
97
+ "č": 89,
98
+ "ē": 143,
99
+ "ę": 119,
100
+ "ě": 140,
101
+ "ğ": 153,
102
+ "ī": 107,
103
+ "ı": 103,
104
+ "ł": 73,
105
+ "ń": 118,
106
+ "ň": 137,
107
+ "ō": 80,
108
+ "ő": 120,
109
+ "œ": 96,
110
+ "ř": 108,
111
+ "ş": 116,
112
+ "š": 90,
113
+ "ū": 101,
114
+ "ž": 95,
115
+ "ș": 104,
116
+ "ə": 138,
117
+ "ʻ": 115,
118
+ "́": 93,
119
+ "̇": 77,
120
+ "α": 106,
121
+ "κ": 109,
122
+ "π": 117,
123
+ "χ": 141,
124
+ "а": 144,
125
+ "в": 124,
126
+ "е": 125,
127
+ "з": 130,
128
+ "и": 113,
129
+ "й": 127,
130
+ "к": 114,
131
+ "л": 126,
132
+ "н": 128,
133
+ "ь": 131,
134
+ "я": 129,
135
+ "נ": 147,
136
+ "ע": 148,
137
+ "ṃ": 121,
138
+ "ạ": 111,
139
+ "ả": 145,
140
+ "ị": 146,
141
+ "ụ": 122,
142
+ "\u200b": 71,
143
+ "–": 61,
144
+ "—": 51,
145
+ "‘": 72,
146
+ "’": 65,
147
+ "”": 63,
148
+ "≡": 142,
149
+ "京": 149,
150
+ "先": 135,
151
+ "大": 151,
152
+ "尚": 134,
153
+ "时": 133,
154
+ "生": 136,
155
+ "都": 150,
156
+ "阪": 152
157
+ },
158
+ "npi": {
159
+ "(": 1,
160
+ ")": 2,
161
+ "/": 3,
162
+ "[PAD]": 92,
163
+ "[UNK]": 91,
164
+ "a": 4,
165
+ "b": 5,
166
+ "c": 6,
167
+ "e": 7,
168
+ "f": 8,
169
+ "k": 9,
170
+ "o": 10,
171
+ "|": 0,
172
+ "": 11,
173
+ "": 12,
174
+ "": 13,
175
+ "": 14,
176
+ "": 15,
177
+ "": 16,
178
+ "": 17,
179
+ "": 18,
180
+ "": 19,
181
+ "": 20,
182
+ "": 21,
183
+ "": 22,
184
+ "": 23,
185
+ "": 24,
186
+ "": 25,
187
+ "": 26,
188
+ "": 27,
189
+ "": 28,
190
+ "": 29,
191
+ "": 30,
192
+ "": 31,
193
+ "": 32,
194
+ "": 33,
195
+ "": 34,
196
+ "": 35,
197
+ "": 36,
198
+ "": 37,
199
+ "": 38,
200
+ "": 39,
201
+ "": 40,
202
+ "": 41,
203
+ "": 42,
204
+ "": 43,
205
+ "": 44,
206
+ "": 45,
207
+ "": 46,
208
+ "": 47,
209
+ "": 48,
210
+ "": 49,
211
+ "": 50,
212
+ "": 51,
213
+ "": 52,
214
+ "": 53,
215
+ "": 54,
216
+ "": 55,
217
+ "": 56,
218
+ "": 57,
219
+ "": 58,
220
+ "": 59,
221
+ "": 60,
222
+ "ि": 61,
223
+ "": 62,
224
+ "": 63,
225
+ "": 64,
226
+ "": 65,
227
+ "": 66,
228
+ "": 67,
229
+ "": 68,
230
+ "": 69,
231
+ "": 70,
232
+ "": 71,
233
+ "": 72,
234
+ "": 73,
235
+ "": 74,
236
+ "": 75,
237
+ "": 76,
238
+ "": 77,
239
+ "": 78,
240
+ "": 79,
241
+ "": 80,
242
+ "": 81,
243
+ "": 82,
244
+ "": 83,
245
+ "": 84,
246
+ "": 85,
247
+ "": 86,
248
+ "": 87,
249
+ "": 88,
250
+ "": 89,
251
+ "’": 90
252
+ }
253
+ }