add tokenizer
Browse files- .gitignore +1 -0
- vocab.json +1 -1
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
checkpoint-*/
|
vocab.json
CHANGED
@@ -1 +1 @@
|
|
1 |
-
{"
|
|
|
1 |
+
{"ứ": 0, "đ": 1, "y": 2, "ả": 3, "ù": 4, "ĩ": 5, "h": 6, "ộ": 7, "e": 8, "è": 9, "ú": 10, "ồ": 11, "ỷ": 12, "ủ": 13, "é": 14, "ễ": 15, "ă": 16, "u": 17, "ệ": 18, "x": 19, "ô": 20, "ừ": 21, "ặ": 22, "ẹ": 23, "ổ": 24, "ẳ": 25, "ẻ": 26, "ỹ": 27, "ẩ": 28, "g": 29, "ề": 30, "ê": 31, "k": 32, "ẵ": 33, "ậ": 34, "c": 35, "ể": 36, "s": 37, "b": 38, "ụ": 39, "v": 40, "ờ": 41, "â": 42, "ử": 43, "w": 44, "ố": 45, "ớ": 46, "ấ": 47, "ỗ": 48, "í": 49, "t": 50, "l": 51, "ắ": 52, "ó": 53, "i": 54, "ế": 55, "ý": 56, "a": 57, "ỳ": 58, "ư": 59, "ợ": 60, "ị": 61, "ỏ": 62, "ở": 63, "ẽ": 64, "f": 65, "ự": 66, "m": 67, "ì": 68, "ạ": 69, "ỵ": 70, "ầ": 71, "ò": 72, "ã": 73, "ẫ": 74, "ọ": 75, "ỉ": 76, "p": 77, "o": 78, "q": 79, "ơ": 81, "ũ": 82, "ằ": 83, "n": 84, "õ": 85, "á": 86, "d": 87, "à": 88, "r": 89, "ữ": 90, "ỡ": 91, "z": 92, "j": 93, "|": 80, "[UNK]": 94, "[PAD]": 95}
|